From 925fb058c84551cb38ebf1465ab79881af57f136 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 12 Nov 2025 21:53:11 -0800
Subject: [PATCH 01/29] [RISCV][GISel] Fallback to SelectionDAG for vleff
 intrinsics. (#167776)

Supporting this in GISel requires multiple changes to IRTranslator to
support aggregate returns containing scalable vectors and non-scalable
types. Falling back is the quickest way to fix the crash.

Fixes #167618
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp        |  6 ++++--
 .../RISCV/GlobalISel/irtranslator/vec-vleff.ll     | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-vleff.ll
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d086a2a4a3057..28fe76bb35b0c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -25401,8 +25401,10 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
   if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
     // Mark RVV intrinsic as supported.
     if (RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(II->getIntrinsicID())) {
-      // GISel doesn't support tuple types yet.
-      if (Inst.getType()->isRISCVVectorTupleTy())
+      // GISel doesn't support tuple types yet. It also doesn't support
+      // returning a struct containing a scalable vector like vleff.
+      if (Inst.getType()->isRISCVVectorTupleTy() ||
+          Inst.getType()->isStructTy())
         return true;
 
       for (unsigned i = 0; i < II->arg_size(); ++i)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-vleff.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-vleff.ll
new file mode 100644
index 0000000000000..e88e6953b80f4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-vleff.ll
@@ -0,0 +1,14 @@
+; RUN: not --crash llc -global-isel -mtriple=riscv64 -mattr=+v -filetype=null %s 2>&1 | FileCheck %s
+
+; Intrinsics returning structs and extractvalue of scalable vector are not
+; supported yet.
+define <vscale x 1 x i64> @intrinsic_vleff_v_nxv1i64_nxv1i64(ptr %0, i64 %1, ptr %2) nounwind {
+entry:
+  %a = call { <vscale x 1 x i64>, i64 } @llvm.riscv.vleff.nxv1i64(<vscale x 1 x i64> poison, ptr %0, i64 %1)
+  %b = extractvalue { <vscale x 1 x i64>, i64 } %a, 0
+  %c = extractvalue { <vscale x 1 x i64>, i64 } %a, 1
+  store i64 %c, ptr %2
+  ret <vscale x 1 x i64> %b
+}
+
+; CHECK: LLVM ERROR: unable to translate instruction: call llvm.riscv.vleff.nxv1i64.i64.p0 (in function: intrinsic_vleff_v_nxv1i64_nxv1i64)

From 13251f5f0663ae9db776766333cf7fcc80d2a534 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 12 Nov 2025 22:15:00 -0800
Subject: [PATCH 02/29] [DWARFCFIChecker] Use MCRegister instead of MCPhysReg.
 NFC (#167823)

---
 llvm/lib/DWARFCFIChecker/Registers.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/DWARFCFIChecker/Registers.h b/llvm/lib/DWARFCFIChecker/Registers.h
index 915250de5aeae..e5851a28418d6 100644
--- a/llvm/lib/DWARFCFIChecker/Registers.h
+++ b/llvm/lib/DWARFCFIChecker/Registers.h
@@ -23,7 +23,7 @@ namespace llvm {
 /// This analysis only keeps track and cares about super registers, not the
 /// subregisters. All reads from/writes to subregisters are considered the
 /// same operation to super registers.
-inline bool isSuperReg(const MCRegisterInfo *MCRI, MCPhysReg Reg) {
+inline bool isSuperReg(const MCRegisterInfo *MCRI, MCRegister Reg) {
   return MCRI->superregs(Reg).empty();
 }
 
@@ -31,9 +31,9 @@ inline SmallVector<MCPhysReg> getSuperRegs(const MCRegisterInfo *MCRI) {
   SmallVector<MCPhysReg> SuperRegs;
   for (auto &&RegClass : MCRI->regclasses())
     for (unsigned I = 0; I < RegClass.getNumRegs(); I++) {
-      MCPhysReg Reg = RegClass.getRegister(I);
+      MCRegister Reg = RegClass.getRegister(I);
       if (isSuperReg(MCRI, Reg))
-        SuperRegs.push_back(Reg);
+        SuperRegs.push_back(Reg.id());
     }
 
   sort(SuperRegs.begin(), SuperRegs.end());
@@ -49,7 +49,7 @@ inline SmallVector<MCPhysReg> getTrackingRegs(const MCRegisterInfo *MCRI) {
   return TrackingRegs;
 }
 
-inline MCPhysReg getSuperReg(const MCRegisterInfo *MCRI, MCPhysReg Reg) {
+inline MCRegister getSuperReg(const MCRegisterInfo *MCRI, MCRegister Reg) {
   if (isSuperReg(MCRI, Reg))
     return Reg;
   for (auto SuperReg : MCRI->superregs(Reg))

From 99a726ea51d31eef39a5defe05a1b9e30cf9f622 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 12 Nov 2025 22:56:57 -0800
Subject: [PATCH 03/29] [SelectionDAGISel] Const correct ChainNodesMatched
 argument to HandleMergeInputChains. NFC (#167807)

---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index e78dfb12505c7..e7d4c4b88191b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2782,7 +2782,7 @@ void SelectionDAGISel::UpdateChains(
 /// induce cycles in the DAG) and if so, creating a TokenFactor node. that will
 /// be used as the input node chain for the generated nodes.
 static SDValue
-HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
+HandleMergeInputChains(const SmallVectorImpl<SDNode *> &ChainNodesMatched,
                        SelectionDAG *CurDAG) {
 
   SmallPtrSet<const SDNode *, 16> Visited;

From 140e07c862ccf02c3c7412187ce79139ac45f017 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Thu, 13 Nov 2025 02:02:21 -0500
Subject: [PATCH 04/29] Revert "Reland yet again: [mlir] Add FP software
 implementation lowering pass: `arith-to-apfloat`" (#167834)

Reverts llvm/llvm-project#167608

Broken builder https://lab.llvm.org/buildbot/#/builders/52/builds/12781
---
 .../ArithToAPFloat/ArithToAPFloat.h           |  21 ---
 mlir/include/mlir/Conversion/Passes.h         |   1 -
 mlir/include/mlir/Conversion/Passes.td        |  15 --
 mlir/include/mlir/Dialect/Func/Utils/Utils.h  |   7 -
 .../mlir/Dialect/LLVMIR/FunctionCallUtils.h   |   4 -
 .../ArithToAPFloat/ArithToAPFloat.cpp         | 163 ------------------
 .../Conversion/ArithToAPFloat/CMakeLists.txt  |  18 --
 .../Conversion/ArithToLLVM/ArithToLLVM.cpp    |   1 -
 mlir/lib/Conversion/CMakeLists.txt            |   1 -
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      |  14 --
 mlir/lib/Dialect/Func/Utils/Utils.cpp         |  25 ---
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   |  11 --
 mlir/lib/ExecutionEngine/APFloatWrappers.cpp  |  89 ----------
 mlir/lib/ExecutionEngine/CMakeLists.txt       |  17 --
 .../ArithToApfloat/arith-to-apfloat.mlir      | 128 --------------
 .../Arith/CPU/test-apfloat-emulation.mlir     |  36 ----
 mlir/test/lit.cfg.py                          |   1 -
 17 files changed, 552 deletions(-)
 delete mode 100644 mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
 delete mode 100644 mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
 delete mode 100644 mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
 delete mode 100644 mlir/lib/ExecutionEngine/APFloatWrappers.cpp
 delete mode 100644 mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
 delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir

diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
deleted file mode 100644
index 64a42a228199e..0000000000000
--- a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===//
-//
-// Part of the APFloat Project, under the Apache License v2.0 with APFloat
-// Exceptions. See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH APFloat-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
-#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
-
-#include <memory>
-
-namespace mlir {
-class Pass;
-
-#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 82bdfd02661a6..40d866ec7bf10 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -12,7 +12,6 @@
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
-#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
 #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h"
 #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 79bc380dbcb7a..70e3e45c225db 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -186,21 +186,6 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> {
   ];
 }
 
-//===----------------------------------------------------------------------===//
-// ArithToAPFloat
-//===----------------------------------------------------------------------===//
-
-def ArithToAPFloatConversionPass
-    : Pass<"convert-arith-to-apfloat", "ModuleOp"> {
-  let summary = "Convert Arith ops to APFloat runtime library calls";
-  let description = [{
-    This pass converts supported Arith ops to APFloat-based runtime library
-    calls (APFloatWrappers.cpp). APFloat is a software implementation of
-    floating-point arithmetic operations.
-  }];
-  let dependentDialects = ["func::FuncDialect"];
-}
-
 //===----------------------------------------------------------------------===//
 // ArithToSPIRV
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
index 00d50874a2e8d..3576126a487ac 100644
--- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
@@ -60,13 +60,6 @@ mlir::FailureOr<std::pair<mlir::func::FuncOp, mlir::func::CallOp>>
 deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp,
                         mlir::ModuleOp moduleOp);
 
-/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name
-/// `name`. Return a failure if the FuncOp is found but with a different
-/// signature.
-FailureOr<FuncOp> lookupFnDecl(SymbolOpInterface symTable, StringRef name,
-                               FunctionType funcT,
-                               SymbolTableCollection *symbolTables = nullptr);
-
 } // namespace func
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index b09d32022e348..8ad9ed18acebd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -52,10 +52,6 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
                          SymbolTableCollection *symbolTables = nullptr);
-FailureOr<LLVM::LLVMFuncOp>
-lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
-                             SymbolTableCollection *symbolTables = nullptr);
-
 /// Declares a function to print a C-string.
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
deleted file mode 100644
index 699edb188a70a..0000000000000
--- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
-
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Func/Utils/Utils.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Verifier.h"
-#include "mlir/Transforms/WalkPatternRewriteDriver.h"
-
-namespace mlir {
-#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-using namespace mlir::func;
-
-static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable,
-                           StringRef name, FunctionType funcT, bool setPrivate,
-                           SymbolTableCollection *symbolTables = nullptr) {
-  OpBuilder::InsertionGuard g(b);
-  assert(!symTable->getRegion(0).empty() && "expected non-empty region");
-  b.setInsertionPointToStart(&symTable->getRegion(0).front());
-  FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT);
-  if (setPrivate)
-    funcOp.setPrivate();
-  if (symbolTables) {
-    SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable);
-    symbolTable.insert(funcOp, symTable->getRegion(0).front().begin());
-  }
-  return funcOp;
-}
-
-/// Helper function to look up or create the symbol for a runtime library
-/// function for a binary arithmetic operation.
-///
-/// Parameter 1: APFloat semantics
-/// Parameter 2: Left-hand side operand
-/// Parameter 3: Right-hand side operand
-///
-/// This function will return a failure if the function is found but has an
-/// unexpected signature.
-///
-static FailureOr<FuncOp>
-lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
-                       SymbolTableCollection *symbolTables = nullptr) {
-  auto i32Type = IntegerType::get(symTable->getContext(), 32);
-  auto i64Type = IntegerType::get(symTable->getContext(), 64);
-
-  std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str();
-  FunctionType funcT =
-      FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type});
-  FailureOr<FuncOp> func =
-      lookupFnDecl(symTable, funcName, funcT, symbolTables);
-  // Failed due to type mismatch.
-  if (failed(func))
-    return func;
-  // Successfully matched existing decl.
-  if (*func)
-    return *func;
-
-  return createFnDecl(b, symTable, funcName, funcT,
-                      /*setPrivate=*/true, symbolTables);
-}
-
-/// Rewrite a binary arithmetic operation to an APFloat function call.
-template <typename OpTy>
-struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
-  BinaryArithOpToAPFloatConversion(MLIRContext *context,
-                                   const char *APFloatName,
-                                   SymbolOpInterface symTable,
-                                   PatternBenefit benefit = 1)
-      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable),
-        APFloatName(APFloatName) {};
-
-  LogicalResult matchAndRewrite(OpTy op,
-                                PatternRewriter &rewriter) const override {
-    // Get APFloat function from runtime library.
-    FailureOr<FuncOp> fn =
-        lookupOrCreateBinaryFn(rewriter, symTable, APFloatName);
-    if (failed(fn))
-      return fn;
-
-    rewriter.setInsertionPoint(op);
-    // Cast operands to 64-bit integers.
-    Location loc = op.getLoc();
-    auto floatTy = cast<FloatType>(op.getType());
-    auto intWType = rewriter.getIntegerType(floatTy.getWidth());
-    auto int64Type = rewriter.getI64Type();
-    Value lhsBits = arith::ExtUIOp::create(
-        rewriter, loc, int64Type,
-        arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs()));
-    Value rhsBits = arith::ExtUIOp::create(
-        rewriter, loc, int64Type,
-        arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs()));
-
-    // Call APFloat function.
-    int32_t sem =
-        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
-    Value semValue = arith::ConstantOp::create(
-        rewriter, loc, rewriter.getI32Type(),
-        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
-    SmallVector<Value> params = {semValue, lhsBits, rhsBits};
-    auto resultOp =
-        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
-                             SymbolRefAttr::get(*fn), params);
-
-    // Truncate result to the original width.
-    Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType,
-                                                  resultOp->getResult(0));
-    rewriter.replaceOp(
-        op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits));
-    return success();
-  }
-
-  SymbolOpInterface symTable;
-  const char *APFloatName;
-};
-
-namespace {
-struct ArithToAPFloatConversionPass final
-    : impl::ArithToAPFloatConversionPassBase<ArithToAPFloatConversionPass> {
-  using Base::Base;
-
-  void runOnOperation() override;
-};
-
-void ArithToAPFloatConversionPass::runOnOperation() {
-  MLIRContext *context = &getContext();
-  RewritePatternSet patterns(context);
-  patterns.add<BinaryArithOpToAPFloatConversion<arith::AddFOp>>(context, "add",
-                                                                getOperation());
-  patterns.add<BinaryArithOpToAPFloatConversion<arith::SubFOp>>(
-      context, "subtract", getOperation());
-  patterns.add<BinaryArithOpToAPFloatConversion<arith::MulFOp>>(
-      context, "multiply", getOperation());
-  patterns.add<BinaryArithOpToAPFloatConversion<arith::DivFOp>>(
-      context, "divide", getOperation());
-  patterns.add<BinaryArithOpToAPFloatConversion<arith::RemFOp>>(
-      context, "remainder", getOperation());
-  LogicalResult result = success();
-  ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) {
-    if (diag.getSeverity() == DiagnosticSeverity::Error) {
-      result = failure();
-    }
-    // NB: if you don't return failure, no other diag handlers will fire (see
-    // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit).
-    return failure();
-  });
-  walkAndApplyPatterns(getOperation(), std::move(patterns));
-  if (failed(result))
-    return signalPassFailure();
-}
-} // namespace
diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
deleted file mode 100644
index b5ec49c087163..0000000000000
--- a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-add_mlir_conversion_library(MLIRArithToAPFloat
-  ArithToAPFloat.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM
-
-  DEPENDS
-  MLIRConversionPassIncGen
-
-  LINK_COMPONENTS
-  Core
-
-  LINK_LIBS PUBLIC
-  MLIRArithDialect
-  MLIRArithTransforms
-  MLIRFuncDialect
-  MLIRFuncUtils
-  )
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index f2bacc3399144..b6099902cc337 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -14,7 +14,6 @@
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/TypeUtilities.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 613dc6d242ceb..bebf1b8fff3f9 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -2,7 +2,6 @@ add_subdirectory(AffineToStandard)
 add_subdirectory(AMDGPUToROCDL)
 add_subdirectory(ArithCommon)
 add_subdirectory(ArithToAMDGPU)
-add_subdirectory(ArithToAPFloat)
 add_subdirectory(ArithToArmSME)
 add_subdirectory(ArithToEmitC)
 add_subdirectory(ArithToLLVM)
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index c747e1b59558a..69a317ecd101f 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1654,20 +1654,6 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
           return failure();
         }
       }
-    } else if (auto floatTy = dyn_cast<FloatType>(printType)) {
-      // Print other floating-point types using the APFloat runtime library.
-      int32_t sem =
-          llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
-      Value semValue = LLVM::ConstantOp::create(
-          rewriter, loc, rewriter.getI32Type(),
-          rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
-      Value floatBits =
-          LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value);
-      printer =
-          LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables);
-      emitCall(rewriter, loc, printer.value(),
-               ValueRange({semValue, floatBits}));
-      return success();
     } else {
       return failure();
     }
diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp
index d6dfd0229963c..b4cb0932ef631 100644
--- a/mlir/lib/Dialect/Func/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp
@@ -254,28 +254,3 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp,
 
   return std::make_pair(*newFuncOpOrFailure, newCallOp);
 }
-
-FailureOr<func::FuncOp>
-func::lookupFnDecl(SymbolOpInterface symTable, StringRef name,
-                   FunctionType funcT, SymbolTableCollection *symbolTables) {
-  FuncOp func;
-  if (symbolTables) {
-    func = symbolTables->lookupSymbolIn<FuncOp>(
-        symTable, StringAttr::get(symTable->getContext(), name));
-  } else {
-    func = llvm::dyn_cast_or_null<FuncOp>(
-        SymbolTable::lookupSymbolIn(symTable, name));
-  }
-
-  if (!func)
-    return func;
-
-  mlir::FunctionType foundFuncT = func.getFunctionType();
-  // Assert the signature of the found function is same as expected
-  if (funcT != foundFuncT) {
-    return func.emitError("matched function '")
-           << name << "' but with different type: " << foundFuncT
-           << " (expected " << funcT << ")";
-  }
-  return func;
-}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index 160b6ae89215c..feaffa34897b6 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -30,7 +30,6 @@ static constexpr llvm::StringRef kPrintF16 = "printF16";
 static constexpr llvm::StringRef kPrintBF16 = "printBF16";
 static constexpr llvm::StringRef kPrintF32 = "printF32";
 static constexpr llvm::StringRef kPrintF64 = "printF64";
-static constexpr llvm::StringRef kPrintApFloat = "printApFloat";
 static constexpr llvm::StringRef kPrintString = "printString";
 static constexpr llvm::StringRef kPrintOpen = "printOpen";
 static constexpr llvm::StringRef kPrintClose = "printClose";
@@ -161,16 +160,6 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
       LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
-FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
-                                         SymbolTableCollection *symbolTables) {
-  return lookupOrCreateReservedFn(
-      b, moduleOp, kPrintApFloat,
-      {IntegerType::get(moduleOp->getContext(), 32),
-       IntegerType::get(moduleOp->getContext(), 64)},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
-}
-
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
   return LLVM::LLVMPointerType::get(context);
 }
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
deleted file mode 100644
index 0a05f7369e556..0000000000000
--- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file exposes the APFloat infrastructure to MLIR programs as a runtime
-// library. APFloat is a software implementation of floating point arithmetics.
-//
-// On the MLIR side, floating-point values must be bitcasted to 64-bit integers
-// before calling a runtime function. If a floating-point type has less than
-// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an
-// integer.
-//
-// Runtime functions receive the floating-point operands of the arithmeic
-// operation in the form of 64-bit integers, along with the APFloat semantics
-// in the form of a 32-bit integer, which will be interpreted as an
-// APFloatBase::Semantics enum value.
-//
-#include "llvm/ADT/APFloat.h"
-
-#ifdef _WIN32
-#ifndef MLIR_APFLOAT_WRAPPERS_EXPORT
-#ifdef mlir_apfloat_wrappers_EXPORTS
-// We are building this library
-#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllexport)
-#else
-// We are using this library
-#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllimport)
-#endif // mlir_apfloat_wrappers_EXPORTS
-#endif // MLIR_APFLOAT_WRAPPERS_EXPORT
-#else
-// Non-windows: use visibility attributes.
-#define MLIR_APFLOAT_WRAPPERS_EXPORT __attribute__((visibility("default")))
-#endif // _WIN32
-
-/// Binary operations without rounding mode.
-#define APFLOAT_BINARY_OP(OP)                                                  \
-  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                     \
-      int32_t semantics, uint64_t a, uint64_t b) {                             \
-    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
-        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
-    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
-    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
-    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
-    lhs.OP(rhs);                                                               \
-    return lhs.bitcastToAPInt().getZExtValue();                                \
-  }
-
-/// Binary operations with rounding mode.
-#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                     \
-  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                     \
-      int32_t semantics, uint64_t a, uint64_t b) {                             \
-    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
-        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
-    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
-    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
-    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
-    lhs.OP(rhs, ROUNDING_MODE);                                                \
-    return lhs.bitcastToAPInt().getZExtValue();                                \
-  }
-
-extern "C" {
-
-#define BIN_OPS_WITH_ROUNDING(X)                                               \
-  X(add, llvm::RoundingMode::NearestTiesToEven)                                \
-  X(subtract, llvm::RoundingMode::NearestTiesToEven)                           \
-  X(multiply, llvm::RoundingMode::NearestTiesToEven)                           \
-  X(divide, llvm::RoundingMode::NearestTiesToEven)
-
-BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE)
-#undef BIN_OPS_WITH_ROUNDING
-#undef APFLOAT_BINARY_OP_ROUNDING_MODE
-
-APFLOAT_BINARY_OP(remainder)
-
-#undef APFLOAT_BINARY_OP
-
-MLIR_APFLOAT_WRAPPERS_EXPORT void printApFloat(int32_t semantics, uint64_t a) {
-  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
-      static_cast<llvm::APFloatBase::Semantics>(semantics));
-  unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
-  llvm::APFloat x(sem, llvm::APInt(bitWidth, a));
-  double d = x.convertToDouble();
-  fprintf(stdout, "%lg", d);
-}
-}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 0045675bcb448..fdeb4dacf9278 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -2,7 +2,6 @@
 # is a big dependency which most don't need.
 
 set(LLVM_OPTIONAL_SOURCES
-  APFloatWrappers.cpp
   ArmRunnerUtils.cpp
   ArmSMEStubs.cpp
   AsyncRuntime.cpp
@@ -168,20 +167,6 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
-  add_mlir_library(mlir_apfloat_wrappers
-    SHARED
-    APFloatWrappers.cpp
-
-    EXCLUDE_FROM_LIBMLIR
-    )
-  set_target_properties(
-    mlir_apfloat_wrappers
-    PROPERTIES CXX_STANDARD 17
-               CXX_VISIBILITY_PRESET hidden
-               VISIBILITY_INLINES_HIDDEN ON
-  )
-  target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS)
-
   add_subdirectory(SparseTensor)
 
   add_mlir_library(mlir_c_runner_utils
@@ -192,7 +177,6 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
-    mlir_apfloat_wrappers
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
@@ -207,7 +191,6 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
-    mlir_apfloat_wrappers
     mlir_float16_utils
   )
   target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
deleted file mode 100644
index 797f42c37a26f..0000000000000
--- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
+++ /dev/null
@@ -1,128 +0,0 @@
-// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s
-
-// CHECK-LABEL:   func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64
-
-// CHECK-LABEL:   func.func @foo() -> f8E4M3FN {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN
-// CHECK:           return %[[CONSTANT_0]] : f8E4M3FN
-// CHECK:         }
-
-// CHECK-LABEL:   func.func @bar() -> f6E3M2FN {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN
-// CHECK:           return %[[CONSTANT_0]] : f6E3M2FN
-// CHECK:         }
-
-// Illustrate that both f8E4M3FN and f6E3M2FN calling the same _mlir_apfloat_add is fine
-// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width.
-// CHECK-LABEL:   func.func @full_example() {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN
-// CHECK:           %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN
-// CHECK:           %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8
-// CHECK:           %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64
-// CHECK:           %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8
-// CHECK:           %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64
-//                  // fltSemantics semantics for f8E4M3FN
-// CHECK:           %[[CONSTANT_1:.*]] = arith.constant 10 : i32
-// CHECK:           %[[VAL_1:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64
-// CHECK:           %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8
-// CHECK:           %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN
-// CHECK:           vector.print %[[BITCAST_2]] : f8E4M3FN
-
-// CHECK:           %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN
-// CHECK:           %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN
-// CHECK:           %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6
-// CHECK:           %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64
-// CHECK:           %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6
-// CHECK:           %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64
-//                  // fltSemantics semantics for f6E3M2FN
-// CHECK:           %[[CONSTANT_3:.*]] = arith.constant 16 : i32
-// CHECK:           %[[VAL_3:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64
-// CHECK:           %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6
-// CHECK:           %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN
-// CHECK:           vector.print %[[BITCAST_5]] : f6E3M2FN
-// CHECK:           return
-// CHECK:         }
-
-// Put rhs into separate function so that it won't be constant-folded.
-func.func @foo() -> f8E4M3FN {
-  %cst = arith.constant 2.2 : f8E4M3FN
-  return %cst : f8E4M3FN
-}
-
-func.func @bar() -> f6E3M2FN {
-  %cst = arith.constant 3.2 : f6E3M2FN
-  return %cst : f6E3M2FN
-}
-
-func.func @full_example() {
-  %a = arith.constant 1.4 : f8E4M3FN
-  %b = func.call @foo() : () -> (f8E4M3FN)
-  %c = arith.addf %a, %b : f8E4M3FN
-  vector.print %c : f8E4M3FN
-
-  %d = arith.constant 2.4 : f6E3M2FN
-  %e = func.call @bar() : () -> (f6E3M2FN)
-  %f = arith.addf %d, %e : f6E3M2FN
-  vector.print %f : f6E3M2FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// Test decl collision (different type)
-// expected-error@+1{{matched function '_mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}}
-func.func private @_mlir_apfloat_add(i32, i32, f32) -> index
-func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_subtract(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.subf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_multiply(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.mulf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_divide(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.divf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_remainder(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.remf %arg0, %arg1 : f4E2M1FN
-  return
-}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
deleted file mode 100644
index 2768afe0834b5..0000000000000
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
+++ /dev/null
@@ -1,36 +0,0 @@
-// Case 1: All floating-point arithmetics is lowered through APFloat.
-// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \
-// RUN: mlir-runner -e entry --entry-point-result=void \
-// RUN:             --shared-libs=%mlir_c_runner_utils \
-// RUN:             --shared-libs=%mlir_apfloat_wrappers | FileCheck %s
-
-// Case 2: Only unsupported arithmetics (f8E4M3FN) is lowered through APFloat.
-//         Arithmetics on f32 is lowered directly to LLVM.
-// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \
-// RUN:          --convert-to-llvm --reconcile-unrealized-casts | \
-// RUN: mlir-runner -e entry --entry-point-result=void \
-// RUN:             --shared-libs=%mlir_c_runner_utils \
-// RUN:             --shared-libs=%mlir_apfloat_wrappers | FileCheck %s
-
-// Put rhs into separate function so that it won't be constant-folded.
-func.func @foo() -> (f8E4M3FN, f32) {
-  %cst1 = arith.constant 2.2 : f8E4M3FN
-  %cst2 = arith.constant 2.2 : f32
-  return %cst1, %cst2 : f8E4M3FN, f32
-}
-
-func.func @entry() {
-  %a1 = arith.constant 1.4 : f8E4M3FN
-  %a2 = arith.constant 1.4 : f32
-  %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
-  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
-  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
-
-  // CHECK: 3.5
-  vector.print %c1 : f8E4M3FN
-
-  // CHECK: 3.6
-  vector.print %c2 : f32
-
-  return
-}
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 4a38ed605be0c..6ff12d66523f5 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -208,7 +208,6 @@ def find_real_python_interpreter():
     add_runtime("mlir_c_runner_utils"),
     add_runtime("mlir_async_runtime"),
     add_runtime("mlir_float16_utils"),
-    add_runtime("mlir_apfloat_wrappers"),
     "mlir-linalg-ods-yaml-gen",
     "mlir-reduce",
     "mlir-pdll",

From 147e615a658db0c0679ff2814dbc1bc59ded5160 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Thu, 13 Nov 2025 07:17:07 +0000
Subject: [PATCH 05/29] [CI] Fix misspelled runtimes_targets variable (#167696)

This was preventing check-compiler-rt from actually running when we
touched a project that was supposed to cause compiler-rt to be tested.
---
 .ci/monolithic-windows.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index beaed71f49f65..7b926b87f3623 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -55,7 +55,7 @@ start-group "ninja"
 ninja -C "${BUILD_DIR}" -k 0 ${targets} |& tee ninja.log
 cp ${BUILD_DIR}/.ninja_log ninja.ninja_log
 
-if [[ "${runtime_targets}" != "" ]]; then
+if [[ "${runtimes_targets}" != "" ]]; then
   start-group "ninja runtimes"
   
   ninja -C "${BUILD_DIR}" -k 0 ${runtimes_targets} |& tee ninja_runtimes.log

From 5edf70c41c5d69f3751b4199f642f4585599dade Mon Sep 17 00:00:00 2001
From: mitchell <mitchell.xu2@gmail.com>
Date: Thu, 13 Nov 2025 15:47:09 +0800
Subject: [PATCH 06/29] [clang-tidy][docs][NFC] Enforce 80 characters limit
 (1/N) (#167492)

Fix documentation in `abseil`, `android`, `altera`, `boost` and
`bugprone`.

This is part of the codebase cleanup described in
[#167098](https://github.com/llvm/llvm-project/issues/167098)
---
 .../checks/abseil/duration-addition.rst       |  8 ++--
 .../checks/abseil/duration-division.rst       |  6 ++-
 .../abseil/faster-strsplit-delimiter.rst      |  6 +--
 .../abseil/string-find-str-contains.rst       |  4 +-
 .../abseil/upgrade-duration-conversions.rst   | 10 ++---
 .../checks/android/cloexec-inotify-init1.rst  |  7 +--
 .../checks/android/cloexec-pipe.rst           |  7 +--
 .../checks/android/cloexec-pipe2.rst          |  5 ++-
 .../comparison-in-temp-failure-retry.rst      |  4 +-
 .../clang-tidy/checks/boost/use-to-string.rst |  6 +--
 .../bugprone/assignment-in-if-condition.rst   | 10 +++--
 .../checks/bugprone/bitwise-pointer-cast.rst  |  6 +--
 .../capturing-this-in-member-variable.rst     | 10 ++---
 .../checks/bugprone/casting-through-void.rst  | 17 ++++---
 ...are-pointer-to-member-virtual-function.rst | 20 ++++-----
 .../copy-constructor-mutates-argument.rst     |  2 +-
 .../checks/bugprone/dangling-handle.rst       |  5 ++-
 .../derived-method-shadowing-base-method.rst  |  7 +--
 .../bugprone/dynamic-static-initializers.rst  | 16 ++++---
 .../bugprone/easily-swappable-parameters.rst  |  6 +--
 .../checks/bugprone/empty-catch.rst           |  8 ++--
 .../exception-copy-constructor-throws.rst     |  2 +-
 .../checks/bugprone/exception-escape.rst      |  6 +--
 .../checks/bugprone/fold-init-type.rst        | 11 ++---
 .../forward-declaration-namespace.rst         |  3 +-
 .../forwarding-reference-overload.rst         | 16 +++----
 .../checks/bugprone/inc-dec-in-conditions.rst | 25 ++++++-----
 .../checks/bugprone/infinite-loop.rst         |  8 ++--
 .../invalid-enum-default-initialization.rst   |  4 +-
 .../misplaced-operator-in-strlen-in-alloc.rst | 16 +++----
 .../multiple-new-in-one-expression.rst        | 16 +++----
 .../checks/bugprone/narrowing-conversions.rst | 27 ++++++------
 .../clang-tidy/checks/bugprone/no-escape.rst  |  4 +-
 .../non-zero-enum-to-bool-conversion.rst      | 20 ++++-----
 .../bugprone/not-null-terminated-result.rst   | 22 +++++-----
 .../checks/bugprone/posix-return.rst          |  3 +-
 .../checks/bugprone/random-generator-seed.rst | 10 ++---
 .../bugprone/redundant-branch-condition.rst   | 14 +++---
 .../return-const-ref-from-parameter.rst       | 16 +++----
 .../checks/bugprone/signal-handler.rst        |  8 ++--
 .../checks/bugprone/signed-char-misuse.rst    | 34 +++++++-------
 .../checks/bugprone/sizeof-container.rst      | 10 ++---
 .../checks/bugprone/sizeof-expression.rst     |  6 +--
 .../bugprone/std-namespace-modification.rst   | 15 ++++---
 .../string-literal-with-embedded-nul.rst      |  6 +--
 .../checks/bugprone/suspicious-enum-usage.rst | 22 +++++-----
 .../bugprone/suspicious-memory-comparison.rst |  6 ++-
 .../bugprone/suspicious-realloc-usage.rst     |  9 ++--
 .../checks/bugprone/suspicious-semicolon.rst  | 10 ++---
 .../bugprone/suspicious-string-compare.rst    |  4 +-
 .../checks/bugprone/swapped-arguments.rst     |  8 ++--
 .../bugprone/tagged-union-member-count.rst    | 44 ++++++++++---------
 .../checks/bugprone/throw-keyword-missing.rst |  7 +--
 .../throwing-static-initialization.rst        |  2 +-
 .../bugprone/too-small-loop-variable.rst      | 12 ++---
 .../bugprone/unchecked-optional-access.rst    | 15 ++++---
 .../unchecked-string-to-number-conversion.rst |  4 +-
 .../bugprone/unhandled-self-assignment.rst    | 13 +++---
 .../unintended-char-ostream-output.rst        | 29 ++++++------
 .../checks/bugprone/unsafe-functions.rst      | 15 ++++---
 .../checks/bugprone/unused-return-value.rst   |  3 +-
 .../checks/bugprone/use-after-move.rst        | 38 ++++++++--------
 .../checks/bugprone/virtual-near-miss.rst     |  5 ++-
 63 files changed, 382 insertions(+), 336 deletions(-)

diff --git a/clang-tools-extra/docs/clang-tidy/checks/abseil/duration-addition.rst b/clang-tools-extra/docs/clang-tidy/checks/abseil/duration-addition.rst
index ce2eefd9eea62..e8e3da00212aa 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/abseil/duration-addition.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/abseil/duration-addition.rst
@@ -3,10 +3,10 @@
 abseil-duration-addition
 ========================
 
-Check for cases where addition should be performed in the ``absl::Time`` domain.
-When adding two values, and one is known to be an ``absl::Time``, we can infer
-that the other should be interpreted as an ``absl::Duration`` of a similar
-scale, and make that inference explicit.
+Checks for cases where addition should be performed in the ``absl::Time``
+domain. When adding two values, and one is known to be an ``absl::Time``,
+we can infer that the other should be interpreted as an ``absl::Duration``
+of a similar scale, and make that inference explicit.
 
 Examples:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/abseil/duration-division.rst b/clang-tools-extra/docs/clang-tidy/checks/abseil/duration-division.rst
index b7c8635600cd5..40c12d464687d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/abseil/duration-division.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/abseil/duration-division.rst
@@ -4,8 +4,10 @@ abseil-duration-division
 ========================
 
 ``absl::Duration`` arithmetic works like it does with integers. That means that
-division of two ``absl::Duration`` objects returns an ``int64`` with any fractional
-component truncated toward 0. See `this link <https://github.com/abseil/abseil-cpp/blob/29ff6d4860070bf8fcbd39c8805d0c32d56628a3/absl/time/time.h#L137>`_ for more information on arithmetic with ``absl::Duration``.
+division of two ``absl::Duration`` objects returns an ``int64`` with any
+fractional component truncated toward 0.
+See `this link <https://github.com/abseil/abseil-cpp/blob/29ff6d4860070bf8fcbd39c8805d0c32d56628a3/absl/time/time.h#L137>`_
+for more information on arithmetic with ``absl::Duration``.
 
 For example:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/abseil/faster-strsplit-delimiter.rst b/clang-tools-extra/docs/clang-tidy/checks/abseil/faster-strsplit-delimiter.rst
index fe9115652b538..b5b79d405bded 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/abseil/faster-strsplit-delimiter.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/abseil/faster-strsplit-delimiter.rst
@@ -5,9 +5,9 @@ abseil-faster-strsplit-delimiter
 
 Finds instances of ``absl::StrSplit()`` or ``absl::MaxSplits()`` where the
 delimiter is a single character string literal and replaces with a character.
-The check will offer a suggestion to change the string literal into a character.
-It will also catch code using ``absl::ByAnyChar()`` for just a single character
-and will transform that into a single character as well.
+The check will offer a suggestion to change the string literal into a
+character. It will also catch code using ``absl::ByAnyChar()`` for just a
+single character and will transform that into a single character as well.
 
 These changes will give the same result, but using characters rather than
 single character string literals is more efficient and readable.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/abseil/string-find-str-contains.rst b/clang-tools-extra/docs/clang-tidy/checks/abseil/string-find-str-contains.rst
index 042fbdb3f29a9..418df193d6e0d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/abseil/string-find-str-contains.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/abseil/string-find-str-contains.rst
@@ -3,8 +3,8 @@
 abseil-string-find-str-contains
 ===============================
 
-Finds ``s.find(...) == string::npos`` comparisons (for various string-like types)
-and suggests replacing with ``absl::StrContains()``.
+Finds ``s.find(...) == string::npos`` comparisons (for various string-like
+types) and suggests replacing with ``absl::StrContains()``.
 
 This improves readability and reduces the likelihood of accidentally mixing
 ``find()`` and ``npos`` from different string-like types.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/abseil/upgrade-duration-conversions.rst b/clang-tools-extra/docs/clang-tidy/checks/abseil/upgrade-duration-conversions.rst
index 24e557d2edc32..3abdd40b1b813 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/abseil/upgrade-duration-conversions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/abseil/upgrade-duration-conversions.rst
@@ -8,17 +8,17 @@ argument needs an explicit cast to continue compiling after upcoming API
 changes.
 
 The operators ``*=``, ``/=``, ``*``, and ``/`` for ``absl::Duration`` currently
-accept an argument of class type that is convertible to an arithmetic type. Such
-a call currently converts the value to an ``int64_t``, even in a case such as
-``std::atomic<float>`` that would result in lossy conversion.
+accept an argument of class type that is convertible to an arithmetic type.
+Such a call currently converts the value to an ``int64_t``, even in a case such
+as ``std::atomic<float>`` that would result in lossy conversion.
 
 Additionally, the ``absl::Duration`` factory functions (``absl::Hours``,
 ``absl::Minutes``, etc) currently accept an ``int64_t`` or a floating-point
 type. Similar to the arithmetic operators, calls with an argument of class type
 that is convertible to an arithmetic type go through the ``int64_t`` path.
 
-These operators and factories will be changed to only accept arithmetic types to
-prevent unintended behavior. After these changes are released, passing an
+These operators and factories will be changed to only accept arithmetic types
+to prevent unintended behavior. After these changes are released, passing an
 argument of class type will no longer compile, even if the type is implicitly
 convertible to an arithmetic type.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-inotify-init1.rst b/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-inotify-init1.rst
index 827598ca7c282..741158964ec10 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-inotify-init1.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-inotify-init1.rst
@@ -3,9 +3,10 @@
 android-cloexec-inotify-init1
 =============================
 
-``inotify_init1()`` should include ``IN_CLOEXEC`` in its type argument to avoid the
-file descriptor leakage. Without this flag, an opened sensitive file would
-remain open across a fork+exec to a lower-privileged SELinux domain.
+``inotify_init1()`` should include ``IN_CLOEXEC`` in its type argument
+to avoid the file descriptor leakage. Without this flag, an opened
+sensitive file would remain open across a fork+exec to a
+lower-privileged SELinux domain.
 
 Examples:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-pipe.rst b/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-pipe.rst
index b0504e9baeec2..c19e07f6c5dfe 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-pipe.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-pipe.rst
@@ -3,9 +3,10 @@
 android-cloexec-pipe
 ====================
 
-This check detects usage of ``pipe()``. Using ``pipe()`` is not recommended, ``pipe2()`` is the
-suggested replacement. The check also adds the O_CLOEXEC flag that marks the file descriptor to
-be closed in child processes. Without this flag a sensitive file descriptor can be leaked to a
+This check detects usage of ``pipe()``. Using ``pipe()`` is not recommended,
+``pipe2()`` is the suggested replacement. The check also adds the ``O_CLOEXEC``
+flag that marks the file descriptor to be closed in child processes.
+Without this flag a sensitive file descriptor can be leaked to a
 child process, potentially into a lower-privileged SELinux domain.
 
 Examples:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-pipe2.rst b/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-pipe2.rst
index 9fb54bee0d571..dc9c6fc411177 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-pipe2.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/android/cloexec-pipe2.rst
@@ -3,8 +3,9 @@
 android-cloexec-pipe2
 =====================
 
-This check ensures that pipe2() is called with the O_CLOEXEC flag. The check also
-adds the O_CLOEXEC flag that marks the file descriptor to be closed in child processes.
+This check ensures that ``pipe2()`` is called with the ``O_CLOEXEC`` flag.
+The check also adds the ``O_CLOEXEC`` flag that marks the file descriptor
+to be closed in child processes.
 Without this flag a sensitive file descriptor can be leaked to a child process,
 potentially into a lower-privileged SELinux domain.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/android/comparison-in-temp-failure-retry.rst b/clang-tools-extra/docs/clang-tidy/checks/android/comparison-in-temp-failure-retry.rst
index 31cc72b0579c4..42e8dd97452ef 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/android/comparison-in-temp-failure-retry.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/android/comparison-in-temp-failure-retry.rst
@@ -22,8 +22,8 @@ Example buggy usage looks like:
     // Do something with cs.
   }
 
-Because TEMP_FAILURE_RETRY will check for whether the result *of the comparison*
-is ``-1``, and retry if so.
+Because ``TEMP_FAILURE_RETRY`` will check for whether the result
+*of the comparison* is ``-1``, and retry if so.
 
 If you encounter this, the fix is simple: lift the comparison out of the
 ``TEMP_FAILURE_RETRY`` argument, like so:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/boost/use-to-string.rst b/clang-tools-extra/docs/clang-tidy/checks/boost/use-to-string.rst
index 8365e80fb95a3..9a5bb888ba861 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/boost/use-to-string.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/boost/use-to-string.rst
@@ -3,9 +3,9 @@
 boost-use-to-string
 ===================
 
-This check finds conversion from integer type like ``int`` to ``std::string`` or
-``std::wstring`` using ``boost::lexical_cast``, and replace it with calls to
-``std::to_string`` and ``std::to_wstring``.
+This check finds conversion from integer type like ``int`` to
+``std::string`` or ``std::wstring`` using ``boost::lexical_cast``,
+and replaces it with calls to ``std::to_string`` and ``std::to_wstring``.
 
 It doesn't replace conversion from floating points despite the ``to_string``
 overloads, because it would change the behavior.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst
index 691b6e4db096b..980e8146d0d64 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/assignment-in-if-condition.rst
@@ -4,11 +4,13 @@ bugprone-assignment-in-if-condition
 ===================================
 
 Finds assignments within conditions of `if` statements.
-Such assignments are bug-prone because they may have been intended as equality tests.
+Such assignments are bug-prone because they may have been intended as
+equality tests.
 
-This check finds all assignments within `if` conditions, including ones that are not flagged
-by `-Wparentheses` due to an extra set of parentheses, and including assignments that call
-an overloaded `operator=()`. The identified assignments violate
+This check finds all assignments within `if` conditions, including ones that
+are not flagged by `-Wparentheses` due to an extra set of parentheses, and
+including assignments that call an overloaded ``operator=()``. The identified
+assignments violate
 `BARR group "Rule 8.2.c" <https://barrgroup.com/embedded-systems/books/embedded-c-coding-standard/statement-rules/if-else-statements>`_.
 
 .. code-block:: c++
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/bitwise-pointer-cast.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/bitwise-pointer-cast.rst
index ac58654421a0a..171e6e6157072 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/bitwise-pointer-cast.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/bitwise-pointer-cast.rst
@@ -31,9 +31,9 @@ not on pointer types:
     int x{};
     float y = std::bit_cast<float>(x);
 
-This way, the bytes of the input object are copied into the output object, which
-is much safer. Do note that Undefined Behavior can still occur, if there is no
-value of type ``To`` corresponding to the value representation produced.
+This way, the bytes of the input object are copied into the output object,
+which is much safer. Do note that Undefined Behavior can still occur, if there
+is no value of type ``To`` corresponding to the value representation produced.
 Compilers may be able to optimize this copy and generate identical assembly to
 the original ``reinterpret_cast`` version.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst
index 1017462b8806b..6a6ad7302566e 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst
@@ -6,11 +6,11 @@ bugprone-capturing-this-in-member-variable
 Finds lambda captures that capture the ``this`` pointer and store it as class
 members without handle the copy and move constructors and the assignments.
 
-Capture this in a lambda and store it as a class member is dangerous because the
-lambda can outlive the object it captures. Especially when the object is copied
-or moved, the captured ``this`` pointer will be implicitly propagated to the
-new object. Most of the time, people will believe that the captured ``this``
-pointer points to the new object, which will lead to bugs.
+Capturing ``this`` in a lambda and storing it as a class member is dangerous
+because the lambda can outlive the object it captures. Especially when the
+object is copied or moved, the captured ``this`` pointer will be implicitly
+propagated to the new object. Most of the time, people will believe that the
+captured ``this`` pointer points to the new object, which will lead to bugs.
 
 .. code-block:: c++
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst
index 21efda444e2ff..3c0b52abea707 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/casting-through-void.rst
@@ -13,16 +13,19 @@ Two-step type conversions via ``void*`` are discouraged for several reasons.
 - These conversions bypass valuable compiler support, erasing warnings related
   to pointer alignment. It may violate strict aliasing rule and leading to
   undefined behavior.
-- In scenarios involving multiple inheritance, ambiguity and unexpected outcomes
-  can arise due to the loss of type information, posing runtime issues.
+- In scenarios involving multiple inheritance, ambiguity and unexpected
+  outcomes can arise due to the loss of type information, posing runtime
+  issues.
 
-In summary, avoiding two-step type conversions through ``void*`` ensures clearer code,
-maintains essential compiler warnings, and prevents ambiguity and potential runtime
-errors, particularly in complex inheritance scenarios. If such a cast is wanted,
-it shall be done via ``reinterpret_cast``, to express the intent more clearly.
+In summary, avoiding two-step type conversions through ``void*`` ensures
+clearer code, maintains essential compiler warnings, and prevents ambiguity
+and potential runtime errors, particularly in complex inheritance scenarios.
+If such a cast is wanted, it shall be done via ``reinterpret_cast``,
+to express the intent more clearly.
 
 Note: it is expected that, after applying the suggested fix and using
-``reinterpret_cast``, the check :doc:`cppcoreguidelines-pro-type-reinterpret-cast
+``reinterpret_cast``, the check
+:doc:`cppcoreguidelines-pro-type-reinterpret-cast
 <../cppcoreguidelines/pro-type-reinterpret-cast>` will emit a warning.
 This is intentional: ``reinterpret_cast`` is a dangerous operation that can
 easily break the strict aliasing rules when dereferencing the casted pointer,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/compare-pointer-to-member-virtual-function.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/compare-pointer-to-member-virtual-function.rst
index cc711d60e7fd8..6b2a82f1cfe96 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/compare-pointer-to-member-virtual-function.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/compare-pointer-to-member-virtual-function.rst
@@ -3,8 +3,8 @@
 bugprone-compare-pointer-to-member-virtual-function
 ===================================================
 
-Detects unspecified behavior about equality comparison between pointer to member
-virtual function and anything other than null-pointer-constant.
+Detects unspecified behavior about equality comparison between pointer to
+member virtual function and anything other than null-pointer-constant.
 
 .. code-block:: c++
 
@@ -47,11 +47,11 @@ becomes particularly challenging when dealing with pointers to pure virtual
 functions, as they may not even have a valid address, further complicating
 comparisons.
 
-Instead, it is recommended to utilize the ``typeid`` operator or other appropriate
-mechanisms for comparing objects to ensure robust and predictable behavior in
-your codebase. By heeding this detection and adopting a more reliable comparison
-method, you can mitigate potential issues related to unspecified behavior,
-especially when dealing with pointers to member virtual functions or pure
+Instead, it is recommended to use the ``typeid`` operator or other appropriate
+mechanisms for comparing objects to ensure robust and predictable behavior in
+your codebase. By heeding this detection and adopting a more reliable comparison
+method, you can mitigate potential issues related to unspecified behavior,
+especially when dealing with pointers to member virtual functions or pure
 virtual functions, thereby improving the overall stability and maintainability
 of your code. In scenarios involving pointers to member virtual functions, it's
 only advisable to employ ``nullptr`` for comparisons.
@@ -60,6 +60,6 @@ only advisable to employ ``nullptr`` for comparisons.
 Limitations
 -----------
 
-Does not analyze values stored in a variable. For variable, only analyze all virtual
-methods in the same ``class`` or ``struct`` and diagnose when assigning a pointer
-to member virtual function to this variable is possible.
+Does not analyze values stored in a variable. For variable, only analyze all
+virtual methods in the same ``class`` or ``struct`` and diagnose when assigning
+a pointer to member virtual function to this variable is possible.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst
index 28e5015beeaad..e45a94a0c9c0a 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/copy-constructor-mutates-argument.rst
@@ -8,4 +8,4 @@ in copy constructors and copy assignment operators.
 
 This check corresponds to the CERT C Coding Standard rule
 `OOP58-CPP. Copy operations must not mutate the source object
-<https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP58-CPP.+Copy+operations+must+not+mutate+the+source+object>`_.
\ No newline at end of file
+<https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP58-CPP.+Copy+operations+must+not+mutate+the+source+object>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/dangling-handle.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/dangling-handle.rst
index 752b711b4ef54..c25f8c4e7caaa 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/dangling-handle.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/dangling-handle.rst
@@ -4,8 +4,9 @@ bugprone-dangling-handle
 ========================
 
 Detect dangling references in value handles like ``std::string_view``.
-These dangling references can be a result of constructing handles from temporary
-values, where the temporary is destroyed soon after the handle is created.
+These dangling references can be a result of constructing handles from
+temporary values, where the temporary is destroyed soon after the handle
+is created.
 
 Examples:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst
index aff3e1e6b6fb0..4906b501f9ff3 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.rst
@@ -10,11 +10,12 @@ In order to be considered "shadowing", methods must have the same signature
 Only checks public, non-templated methods.
 
 The below example is bugprone because consumers of the ``Derived`` class will
-expect the ``reset`` method to do the work of ``Base::reset()`` in addition to extra
-work required to reset the ``Derived`` class.  Common fixes include:
+expect the ``reset`` method to do the work of ``Base::reset()`` in addition to
+extra work required to reset the ``Derived`` class.  Common fixes include:
 
 - Making the ``reset`` method polymorphic
-- Re-naming ``Derived::reset`` if it's not meant to intersect with ``Base::reset``
+- Re-naming ``Derived::reset`` if it's not meant to intersect with
+  ``Base::reset``
 - Using ``using Base::reset`` to change the access specifier
 
 This is also a violation of the Liskov Substitution Principle.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/dynamic-static-initializers.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/dynamic-static-initializers.rst
index b4d5f75048a09..93d403cd112d4 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/dynamic-static-initializers.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/dynamic-static-initializers.rst
@@ -8,12 +8,12 @@ in header files.
 
 This can pose problems in certain multithreaded contexts. For example,
 when disabling compiler generated synchronization instructions for
-static variables initialized at runtime (e.g. by ``-fno-threadsafe-statics``), even if a particular project
-takes the necessary precautions to prevent race conditions during
-initialization by providing their own synchronization, header files included from other projects may
-not. Therefore, such a check is helpful for ensuring that disabling
-compiler generated synchronization for static variable initialization will not cause
-problems.
+static variables initialized at runtime (e.g. by ``-fno-threadsafe-statics``),
+even if a particular project takes the necessary precautions to prevent race
+conditions during initialization by providing their own synchronization, header
+files included from other projects may not. Therefore, such a check is helpful
+for ensuring that disabling compiler generated synchronization for static
+variable initialization will not cause problems.
 
 Consider the following code:
 
@@ -24,4 +24,6 @@ Consider the following code:
     return k;
   }
 
-When synchronization of static initialization is disabled, if two threads both call `foo` for the first time, there is the possibility that `k` will be double initialized, creating a race condition.
+When synchronization of static initialization is disabled, if two threads both
+call `foo` for the first time, there is the possibility that `k` will be double
+initialized, creating a race condition.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/easily-swappable-parameters.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/easily-swappable-parameters.rst
index a96d7f6015bda..59ccab4851dcc 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/easily-swappable-parameters.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/easily-swappable-parameters.rst
@@ -12,9 +12,9 @@ swapped (or badly ordered) arguments.
     void drawPoint(int X, int Y) { /* ... */ }
     FILE *open(const char *Dir, const char *Name, Flags Mode) { /* ... */ }
 
-A potential call like ``drawPoint(-2, 5)`` or ``openPath("a.txt", "tmp", Read)``
-is perfectly legal from the language's perspective, but might not be what the
-developer of the function intended.
+A potential call like ``drawPoint(-2, 5)`` or
+``openPath("a.txt", "tmp", Read)`` is perfectly legal from the language's
+perspective, but might not be what the developer of the function intended.
 
 More elaborate and type-safe constructs, such as opaque typedefs or strong
 types should be used instead, to prevent a mistaken order of arguments.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/empty-catch.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/empty-catch.rst
index fd1db594ddb24..87c7edc30f2d4 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/empty-catch.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/empty-catch.rst
@@ -118,10 +118,10 @@ Here is an example:
     it->second.callFunction();
   }
 
-In conclusion, empty catch statements are a bad practice that can lead to hidden
-bugs, security issues, poor code quality, and unreliable code. By handling
-exceptions properly, developers can ensure that their code is robust, secure,
-and maintainable.
+In conclusion, empty catch statements are a bad practice that can lead to
+hidden bugs, security issues, poor code quality, and unreliable code. By
+handling exceptions properly, developers can ensure that their code is
+robust, secure, and maintainable.
 
 Options
 -------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst
index 8c3becf80a541..7170501328ade 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst
@@ -28,4 +28,4 @@ References
 
 This check corresponds to the CERT C++ Coding Standard rule
 `ERR60-CPP. Exception objects must be nothrow copy constructible
-<https://wiki.sei.cmu.edu/confluence/display/cplusplus/ERR60-CPP.+Exception+objects+must+be+nothrow+copy+constructible>`_.
\ No newline at end of file
+<https://wiki.sei.cmu.edu/confluence/display/cplusplus/ERR60-CPP.+Exception+objects+must+be+nothrow+copy+constructible>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst
index 7eaa333d5403a..7d724a4581715 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst
@@ -26,9 +26,9 @@ function also results in unexpected termination.
 
 Functions declared explicitly with ``noexcept(false)`` or ``throw(exception)``
 will be excluded from the analysis, as even though it is not recommended for
-functions like ``swap()``, ``main()``, move constructors, move assignment operators
-and destructors, it is a clear indication of the developer's intention and
-should be respected.
+functions like ``swap()``, ``main()``, move constructors, move assignment
+operators and destructors, it is a clear indication of the developer's
+intention and should be respected.
 
 WARNING! This check may be expensive on large source files.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/fold-init-type.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/fold-init-type.rst
index 8c6872d72f11a..fefad9f5e535b 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/fold-init-type.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/fold-init-type.rst
@@ -6,13 +6,14 @@ bugprone-fold-init-type
 The check flags type mismatches in
 `folds <https://en.wikipedia.org/wiki/Fold_(higher-order_function)>`_
 like ``std::accumulate`` that might result in loss of precision.
-``std::accumulate`` folds an input range into an initial value using the type of
-the latter, with ``operator+`` by default. This can cause loss of precision
-through:
+``std::accumulate`` folds an input range into an initial value using
+the type of the latter, with ``operator+`` by default. This can cause
+loss of precision through:
 
 - Truncation: The following code uses a floating point range and an int
-  initial value, so truncation will happen at every application of ``operator+``
-  and the result will be `0`, which might not be what the user expected.
+  initial value, so truncation will happen at every application of
+  ``operator+`` and the result will be `0`, which might not be what the
+  user expected.
 
 .. code-block:: c++
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/forward-declaration-namespace.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/forward-declaration-namespace.rst
index 99ecb63e7b9c6..5e04f660330d9 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/forward-declaration-namespace.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/forward-declaration-namespace.rst
@@ -17,4 +17,5 @@ the forward declaration is in a potentially wrong namespace.
   // warning : no definition found for 'A', but a definition with the same name
   // 'A' found in another namespace 'nb::'
 
-This check can only generate warnings, but it can't suggest a fix at this point.
+This check can only generate warnings, but it can't suggest a fix at this
+point.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/forwarding-reference-overload.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/forwarding-reference-overload.rst
index 6d70a50d2e9ca..cd079a35a38e2 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/forwarding-reference-overload.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/forwarding-reference-overload.rst
@@ -6,8 +6,8 @@ bugprone-forwarding-reference-overload
 The check looks for perfect forwarding constructors that can hide copy or move
 constructors. If a non const lvalue reference is passed to the constructor, the
 forwarding reference parameter will be a better match than the const reference
-parameter of the copy constructor, so the perfect forwarding constructor will be
-called, which can be confusing.
+parameter of the copy constructor, so the perfect forwarding constructor will
+be called, which can be confusing.
 For detailed description of this issue see: Scott Meyers, Effective Modern C++,
 Item 26.
 
@@ -47,12 +47,12 @@ Consider the following example:
       Person(const Person& rhs);
     };
 
-The check warns for constructors C1 and C2, because those can hide copy and move
-constructors. We suppress warnings if the copy and the move constructors are both
-disabled (deleted or private), because there is nothing the perfect forwarding
-constructor could hide in this case. We also suppress warnings for constructors
-like C3-C6 that are guarded with an ``enable_if`` or a concept, assuming the
-programmer was aware of the possible hiding.
+The check warns for constructors C1 and C2, because those can hide copy and
+move constructors. We suppress warnings if the copy and the move constructors
+are both disabled (deleted or private), because there is nothing the perfect
+forwarding constructor could hide in this case. We also suppress warnings for
+constructors like C3-C6 that are guarded with an ``enable_if`` or a concept,
+assuming the programmer was aware of the possible hiding.
 
 Background
 ----------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/inc-dec-in-conditions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/inc-dec-in-conditions.rst
index 380033ff09089..f3f0331b15b5e 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/inc-dec-in-conditions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/inc-dec-in-conditions.rst
@@ -7,9 +7,9 @@ Detects when a variable is both incremented/decremented and referenced inside a
 complex condition and suggests moving them outside to avoid ambiguity in the
 variable's value.
 
-When a variable is modified and also used in a complex condition, it can lead to
-unexpected behavior. The side-effect of changing the variable's value within the
-condition can make the code difficult to reason about. Additionally, the
+When a variable is modified and also used in a complex condition, it can lead
+to unexpected behavior. The side-effect of changing the variable's value within
+the condition can make the code difficult to reason about. Additionally, the
 developer's intended timing for the modification of the variable may not be
 clear, leading to misunderstandings and errors. This can be particularly
 problematic when the condition involves logical operators like ``&&`` and
@@ -44,8 +44,8 @@ throughout the code.
     // do something
   }
 
-Another common issue occurs when multiple increments or decrements are performed
-on the same variable inside a complex condition. For example:
+Another common issue occurs when multiple increments or decrements are
+performed on the same variable inside a complex condition. For example:
 
 .. code-block:: c++
 
@@ -55,13 +55,14 @@ on the same variable inside a complex condition. For example:
     // do something
   }
 
-There is a potential issue with this code due to the order of evaluation in C++.
-The ``||`` operator used in the condition statement guarantees that if the first
-operand evaluates to ``true``, the second operand will not be evaluated. This
-means that if ``i`` were initially ``4``, the first operand ``i < 5`` would
-evaluate to ``true`` and the second operand ``i > 2`` would not be evaluated.
-As a result, the decrement operation ``--i`` would not be executed and ``i``
-would hold value ``5``, which may not be the intended behavior for the developer.
+There is a potential issue with this code due to the order of evaluation in
+C++. The ``||`` operator used in the condition statement guarantees that if
+the first operand evaluates to ``true``, the second operand will not be
+evaluated. This means that if ``i`` were initially ``4``, the first operand
+``i < 5`` would evaluate to ``true`` and the second operand ``i > 2`` would
+not be evaluated. As a result, the decrement operation ``--i`` would not be
+executed and ``i`` would hold value ``5``, which may not be the intended
+behavior for the developer.
 
 To avoid this potential issue, the both increment and decrement operation on
 ``i`` should be moved outside the condition statement.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/infinite-loop.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/infinite-loop.rst
index 89502c1882a92..bbbc8773868a4 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/infinite-loop.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/infinite-loop.rst
@@ -11,15 +11,15 @@ However, it is possible to detect some obvious infinite loops, for example, if
 the loop condition is not changed. This check detects such loops. A loop is
 considered infinite if it does not have any loop exit statement (``break``,
 ``continue``, ``goto``, ``return``, ``throw`` or a call to a function called as
-``[[noreturn]]``) and all of the following conditions hold for every variable in
-the condition:
+``[[noreturn]]``) and all of the following conditions hold for every variable
+in the condition:
 
 - It is a local variable.
 - It has no reference or pointer aliases.
 - It is not a structure or class member.
 
-Furthermore, the condition must not contain a function call to consider the loop
-infinite since functions may return different values for different calls.
+Furthermore, the condition must not contain a function call to consider the
+loop infinite since functions may return different values for different calls.
 
 For example, the following loop is considered infinite `i` is not changed in
 the body:
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst
index 45cb878383a7d..fcbfce751828d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst
@@ -19,8 +19,8 @@ The check emits a warning only if an ``enum`` variable is default-initialized
 value of 0. The type can be a scoped or non-scoped ``enum``. Unions are not
 handled by the check (if it contains a member of enumeration type).
 
-Note that the ``enum`` ``std::errc`` is always ignored because it is expected to
-be default initialized, despite not defining an enumerator with the value 0.
+Note that the ``enum`` ``std::errc`` is always ignored because it is expected
+to be default initialized, despite not defining an enumerator with the value 0.
 
 .. code-block:: c++
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/misplaced-operator-in-strlen-in-alloc.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/misplaced-operator-in-strlen-in-alloc.rst
index 38df4803b45f3..c9a2a648578ce 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/misplaced-operator-in-strlen-in-alloc.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/misplaced-operator-in-strlen-in-alloc.rst
@@ -4,14 +4,14 @@ bugprone-misplaced-operator-in-strlen-in-alloc
 ==============================================
 
 Finds cases where ``1`` is added to the string in the argument to ``strlen()``,
-``strnlen()``, ``strnlen_s()``, ``wcslen()``, ``wcsnlen()``, and ``wcsnlen_s()``
-instead of the result and the value is used as an argument to a memory
-allocation function (``malloc()``, ``calloc()``, ``realloc()``, ``alloca()``) or
-the ``new[]`` operator in `C++`. The check detects error cases even if one of
-these functions (except the ``new[]`` operator) is called by a constant function
-pointer. Cases where ``1`` is added both to the parameter and the result of the
-``strlen()``-like function are ignored, as are cases where the whole addition is
-surrounded by extra parentheses.
+``strnlen()``, ``strnlen_s()``, ``wcslen()``, ``wcsnlen()``, and
+``wcsnlen_s()`` instead of the result and the value is used as an argument to a
+memory allocation function (``malloc()``, ``calloc()``, ``realloc()``,
+``alloca()``) or the ``new[]`` operator in `C++`. The check detects error cases
+even if one of these functions (except the ``new[]`` operator) is called by a
+constant function pointer. Cases where ``1`` is added both to the parameter and
+the result of the ``strlen()``-like function are ignored, as are cases where
+the whole addition is surrounded by extra parentheses.
 
 `C` example code:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/multiple-new-in-one-expression.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/multiple-new-in-one-expression.rst
index b9b8984ef6584..154013da516d3 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/multiple-new-in-one-expression.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/multiple-new-in-one-expression.rst
@@ -10,11 +10,11 @@ and throws exception.
 C++ does often not specify the exact order of evaluation of the operands of an
 operator or arguments of a function. Therefore if a first allocation succeeds
 and a second fails, in an exception handler it is not possible to tell which
-allocation has failed and free the memory. Even if the order is fixed the result
-of a first ``new`` may be stored in a temporary location that is not reachable
-at the time when a second allocation fails. It is best to avoid any expression
-that contains more than one ``operator new`` call, if exception handling is
-used to check for allocation errors.
+allocation has failed and free the memory. Even if the order is fixed the
+result of a first ``new`` may be stored in a temporary location that is not
+reachable at the time when a second allocation fails. It is best to avoid any
+expression that contains more than one ``operator new`` call, if exception
+handling is used to check for allocation errors.
 
 Different rules apply for are the short-circuit operators ``||`` and ``&&`` and
 the ``,`` operator, where evaluation of one side must be completed before the
@@ -31,9 +31,9 @@ For any warning to be emitted the ``new`` calls should be in a code block where
 exception handling is used with catch for ``std::bad_alloc`` or
 ``std::exception``. At ``||``, ``&&``, ``,``, ``?`` (condition and one branch)
 operators no warning is emitted. No warning is emitted if both of the memory
-allocations are not assigned to a variable or not passed directly to a function.
-The reason is that in this case the memory may be intentionally not freed or the
-allocated objects can be self-destructing objects.
+allocations are not assigned to a variable or not passed directly to a
+function. The reason is that in this case the memory may be intentionally not
+freed or the allocated objects can be self-destructing objects.
 
 Examples:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
index 1a1217ed5a21c..4327bc09babab 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/narrowing-conversions.rst
@@ -3,7 +3,8 @@
 bugprone-narrowing-conversions
 ==============================
 
-`cppcoreguidelines-narrowing-conversions` redirects here as an alias for this check.
+`cppcoreguidelines-narrowing-conversions` redirects here as an alias for
+this check.
 
 Checks for silent narrowing conversions, e.g: ``int i = 0; i += 0.1;``. While
 the issue is obvious in this former example, it might not be so in the
@@ -19,8 +20,8 @@ We flag narrowing conversions from:
    if WarnOnFloatingPointNarrowingConversion Option is set.
 
 This check will flag:
- - All narrowing conversions that are not marked by an explicit cast (c-style or
-   ``static_cast``). For example: ``int i = 0; i += 0.1;``,
+ - All narrowing conversions that are not marked by an explicit cast (c-style
+   or ``static_cast``). For example: ``int i = 0; i += 0.1;``,
    ``void f(int); f(0.1);``,
  - All applications of binary operators with a narrowing conversions.
    For example: ``int i; i+= 0.1;``.
@@ -110,17 +111,17 @@ FAQ
  - What does "narrowing conversion from 'int' to 'float'" mean?
 
 An IEEE754 Floating Point number can represent all integer values in the range
-[-2^PrecisionBits, 2^PrecisionBits] where PrecisionBits is the number of bits in
-the mantissa.
+[-2^PrecisionBits, 2^PrecisionBits] where PrecisionBits is the number of bits
+in the mantissa.
 
-For ``float`` this would be [-2^23, 2^23], where ``int`` can represent values in
-the range [-2^31, 2^31-1].
+For ``float`` this would be [-2^23, 2^23], where ``int`` can represent values
+in the range [-2^31, 2^31-1].
 
  - What does "implementation-defined" mean?
 
-You may have encountered messages like "narrowing conversion from 'unsigned int'
-to signed type 'int' is implementation-defined".
-The C/C++ standard does not mandate two's complement for signed integers, and so
-the compiler is free to define what the semantics are for converting an unsigned
-integer to signed integer. Clang's implementation uses the two's complement
-format.
+You may have encountered messages like "narrowing conversion from 'unsigned
+int' to signed type 'int' is implementation-defined".
+The C/C++ standard does not mandate two's complement for signed integers, and
+so the compiler is free to define what the semantics are for converting an
+unsigned integer to signed integer. Clang's implementation uses the two's
+complement format.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/no-escape.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/no-escape.rst
index 770a71cc04255..850b01c4d76f6 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/no-escape.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/no-escape.rst
@@ -5,8 +5,8 @@ bugprone-no-escape
 
 Finds pointers with the ``noescape`` attribute that are captured by an
 asynchronously-executed block. The block arguments in ``dispatch_async()`` and
-``dispatch_after()`` are guaranteed to escape, so it is an error if a pointer with the
-``noescape`` attribute is captured by one of these blocks.
+``dispatch_after()`` are guaranteed to escape, so it is an error if a pointer
+with the ``noescape`` attribute is captured by one of these blocks.
 
 The following is an example of an invalid use of the ``noescape`` attribute.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion.rst
index 168ed71674773..0ae950d75316e 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion.rst
@@ -3,24 +3,24 @@
 bugprone-non-zero-enum-to-bool-conversion
 =========================================
 
-Detect implicit and explicit casts of ``enum`` type into ``bool`` where ``enum``
-type doesn't have a zero-value enumerator. If the ``enum`` is used only to hold
-values equal to its enumerators, then conversion to ``bool`` will always result
-in ``true`` value. This can lead to unnecessary code that reduces readability
-and maintainability and can result in bugs.
+Detect implicit and explicit casts of ``enum`` type into ``bool`` where
+``enum`` type doesn't have a zero-value enumerator. If the ``enum`` is used
+only to hold values equal to its enumerators, then conversion to ``bool`` will
+always result in ``true`` value. This can lead to unnecessary code that reduces
+readability and maintainability and can result in bugs.
 
 May produce false positives if the ``enum`` is used to store other values
 (used as a bit-mask or zero-initialized on purpose). To deal with them,
-``// NOLINT`` or casting first to the underlying type before casting to ``bool``
-can be used.
+``// NOLINT`` or casting first to the underlying type before casting to
+``bool`` can be used.
 
 It is important to note that this check will not generate warnings if the
 definition of the enumeration type is not available.
 Additionally, C++11 enumeration classes are supported by this check.
 
-Overall, this check serves to improve code quality and readability by identifying
-and flagging instances where implicit or explicit casts from enumeration types to
-boolean could cause potential issues.
+Overall, this check serves to improve code quality and readability by
+identifying and flagging instances where implicit or explicit casts from
+enumeration types to boolean could cause potential issues.
 
 Example
 -------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/not-null-terminated-result.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/not-null-terminated-result.rst
index ed7d65c3a89be..db86e94063ec0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/not-null-terminated-result.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/not-null-terminated-result.rst
@@ -3,10 +3,11 @@
 bugprone-not-null-terminated-result
 ===================================
 
-Finds function calls where it is possible to cause a not null-terminated result.
-Usually the proper length of a string is ``strlen(src) + 1`` or equal length of
-this expression, because the null terminator needs an extra space. Without the
-null terminator it can result in undefined behavior when the string is read.
+Finds function calls where it is possible to cause a not null-terminated
+result. Usually the proper length of a string is ``strlen(src) + 1`` or equal
+length of this expression, because the null terminator needs an extra space.
+Without the null terminator it can result in undefined behavior when the
+string is read.
 
 The following and their respective ``wchar_t`` based functions are checked:
 
@@ -25,8 +26,8 @@ of the allocated memory is not enough to hold the null terminator.
     return result;
   }
 
-In addition to issuing warnings, fix-it rewrites all the necessary code. It also
-tries to adjust the capacity of the destination array:
+In addition to issuing warnings, fix-it rewrites all the necessary code.
+It also tries to adjust the capacity of the destination array:
 
 .. code-block:: c
 
@@ -62,8 +63,8 @@ Rewrite based on the destination array
   the safe version (ending with ``cpy_s``).
 
 - If the new function is could be safe version and C++ files are analyzed and
-  the destination array is plain ``char``/``wchar_t`` without ``un/signed`` then
-  the length of the destination array can be omitted.
+  the destination array is plain ``char``/``wchar_t`` without ``un/signed``
+  then the length of the destination array can be omitted.
 
 - If the new function is could be safe version and the destination array is
   ``un/signed`` it needs to be casted to plain ``char *``/``wchar_t *``.
@@ -76,8 +77,9 @@ Rewrite based on the length of the source string
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 - If the given length is ``strlen(source)`` or equal length of this expression
-  then the new function should be the older copy function (ending with ``cpy``),
-  as it is more efficient than the safe version (ending with ``cpy_s``).
+  then the new function should be the older copy function (ending with
+  ``cpy``), as it is more efficient than the safe version (ending with
+  ``cpy_s``).
 
 - Otherwise we assume that the programmer wanted to copy 'N' characters, so the
   new function is ``ncpy``-like which copies 'N' characters.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/posix-return.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/posix-return.rst
index a5c4ccb0c4d7c..1a4a3ed3b9a37 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/posix-return.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/posix-return.rst
@@ -13,7 +13,8 @@ Example buggy usage looks like:
 
   if (posix_fadvise(...) < 0) {
 
-This will never happen as the return value is always non-negative. A simple fix could be:
+This will never happen as the return value is always non-negative.
+A simple fix could be:
 
 .. code-block:: c
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst
index 25712447f7897..25680994a58d2 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/random-generator-seed.rst
@@ -4,10 +4,10 @@ bugprone-random-generator-seed
 ==============================
 
 Flags all pseudo-random number engines, engine adaptor
-instantiations and ``srand()`` when initialized or seeded with default argument,
-constant expression or any user-configurable type. Pseudo-random number
-engines seeded with a predictable value may cause vulnerabilities e.g. in
-security protocols.
+instantiations and ``srand()`` when initialized or seeded with default
+argument, constant expression or any user-configurable type. Pseudo-random
+number engines seeded with a predictable value may cause vulnerabilities
+e.g. in security protocols.
 
 Examples:
 
@@ -41,4 +41,4 @@ This check corresponds to the CERT C++ Coding Standard rules
 `MSC51-CPP. Ensure your random number generator is properly seeded
 <https://wiki.sei.cmu.edu/confluence/display/cplusplus/MSC51-CPP.+Ensure+your+random+number+generator+is+properly+seeded>`_ and
 `MSC32-C. Properly seed pseudorandom number generators
-<https://wiki.sei.cmu.edu/confluence/display/c/MSC32-C.+Properly+seed+pseudorandom+number+generators>`_.
\ No newline at end of file
+<https://wiki.sei.cmu.edu/confluence/display/c/MSC32-C.+Properly+seed+pseudorandom+number+generators>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/redundant-branch-condition.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/redundant-branch-condition.rst
index c2efff8dec1cd..7a321bd9c0f06 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/redundant-branch-condition.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/redundant-branch-condition.rst
@@ -16,9 +16,9 @@ Simple example:
       scream();
   }
 
-Here `onFire` is checked both in the outer ``if`` and the inner ``if`` statement
-without a possible change between the two checks. The check warns for this code
-and suggests removal of the second checking of variable `onFire`.
+Here `onFire` is checked both in the outer ``if`` and the inner ``if``
+statement without a possible change between the two checks. The check warns for
+this code and suggests removal of the second checking of variable `onFire`.
 
 The checker also detects redundant condition checks if the condition variable
 is an operand of a logical "and" (``&&``) or a logical "or" (``||``) operator:
@@ -44,8 +44,8 @@ condition variable and keep the other side of the ``&&``. In the second case
 (logical "or") the whole ``if`` is removed similarly to the simple case on the
 top.
 
-The condition of the outer ``if`` statement may also be a logical "and" (``&&``)
-expression:
+The condition of the outer ``if`` statement may also be a logical "and"
+(``&&``) expression:
 
 .. code-block:: c
 
@@ -62,8 +62,8 @@ The error is also detected if both the outer statement is a logical "and"
 The inner ``if`` statement does not have to be a direct descendant of the outer
 one.
 
-No error is detected if the condition variable may have been changed between the
-two checks:
+No error is detected if the condition variable may have been changed between
+the two checks:
 
 .. code-block:: c
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst
index 00759a2ca003b..663e2149c7ac9 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst
@@ -3,14 +3,14 @@
 bugprone-return-const-ref-from-parameter
 ========================================
 
-Detects return statements that return a constant reference parameter as constant
-reference. This may cause use-after-free errors if the caller uses xvalues as
-arguments.
-
-In C++, constant reference parameters can accept xvalues which will be destructed
-after the call. When the function returns such a parameter also as constant reference,
-then the returned reference can be used after the object it refers to has been
-destroyed.
+Detects return statements that return a constant reference parameter as
+constant reference. This may cause use-after-free errors if the caller
+uses xvalues as arguments.
+
+In C++, constant reference parameters can accept xvalues which will be
+destructed after the call. When the function returns such a parameter also
+as constant reference, then the returned reference can be used after the
+object it refers to has been destroyed.
 
 Example
 -------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst
index 848fb667e1823..aef27942b9e92 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signal-handler.rst
@@ -20,10 +20,10 @@ Checked signal handler rules for up to and including C++14:
 
 The check is disabled on C++17 and later.
 
-Asynchronous-safety is determined by comparing the function's name against a set
-of known functions. In addition, the function must come from a system header
-include and in a global namespace. The (possible) arguments passed to the
-function are not checked. Any function that cannot be determined to be
+Asynchronous-safety is determined by comparing the function's name against a
+set of known functions. In addition, the function must come from a system
+header include and in a global namespace. The (possible) arguments passed to
+the function are not checked. Any function that cannot be determined to be
 asynchronous-safe is assumed to be non-asynchronous-safe by the check,
 including user functions for which only the declaration is visible.
 Calls to user-defined functions with visible definitions are checked
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signed-char-misuse.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signed-char-misuse.rst
index 3e06e11dffcc7..4b98c36ee84c9 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signed-char-misuse.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signed-char-misuse.rst
@@ -17,30 +17,34 @@ human programmer probably expects that the converted value matches with the
 character code (a value from [0..255]), however, the actual value is in
 [-128..127] interval. To avoid this kind of misinterpretation, the desired way
 of converting from a ``signed char`` to an integer value is converting to
-``unsigned char`` first, which stores all the characters in the positive [0..255]
-interval which matches the known character codes.
+``unsigned char`` first, which stores all the characters in the positive
+[0..255] interval which matches the known character codes.
 
 In case of implicit conversion, the programmer might not actually be aware
 that a conversion happened and char value is used as an integer. There are
-some use cases when this unawareness might lead to a functionally imperfect code.
-For example, checking the equality of a ``signed char`` and an ``unsigned char``
-variable is something we should avoid in C++ code. During this comparison,
-the two variables are converted to integers which have different value ranges.
-For ``signed char``, the non-ASCII characters are stored as a value in [-128..-1]
-interval, while the same characters are stored in the [128..255] interval for
-an ``unsigned char``.
-
-It depends on the actual platform whether plain ``char`` is handled as ``signed char``
-by default and so it is caught by this check or not. To change the default behavior
-you can use ``-funsigned-char`` and ``-fsigned-char`` compilation options.
+some use cases when this unawareness might lead to a functionally imperfect
+code. For example, checking the equality of a ``signed char`` and an
+``unsigned char`` variable is something we should avoid in C++ code. During
+this comparison, the two variables are converted to integers which have
+different value ranges. For ``signed char``, the non-ASCII characters are
+stored as a value in [-128..-1] interval, while the same characters are
+stored in the [128..255] interval for an ``unsigned char``.
+
+It depends on the actual platform whether plain ``char`` is handled as
+``signed char`` by default and so it is caught by this check or not.
+To change the default behavior you can use ``-funsigned-char`` and
+``-fsigned-char`` compilation options.
 
 Currently, this check warns in the following cases:
+
 - ``signed char`` is assigned to an integer variable
-- ``signed char`` and ``unsigned char`` are compared with equality/inequality operator
+- ``signed char`` and ``unsigned char`` are compared with an
+  equality/inequality operator
 - ``signed char`` is converted to an integer in the array subscript
 
 See also:
-`STR34-C. Cast characters to unsigned char before converting to larger integer sizes
+`STR34-C. Cast characters to unsigned char before converting to larger
+integer sizes
 <https://wiki.sei.cmu.edu/confluence/display/c/STR34-C.+Cast+characters+to+unsigned+char+before+converting+to+larger+integer+sizes>`_
 
 A good example from the CERT description when a ``char`` variable is used to
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-container.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-container.rst
index fb2f0b2a6801c..5d70d45a91357 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-container.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-container.rst
@@ -3,12 +3,12 @@
 bugprone-sizeof-container
 =========================
 
-The check finds usages of ``sizeof`` on expressions of STL container types. Most
-likely the user wanted to use ``.size()`` instead.
+The check finds usages of ``sizeof`` on expressions of STL container types.
+Most likely the user wanted to use ``.size()`` instead.
 
-All class/struct types declared in namespace ``std::`` having a const ``size()``
-method are considered containers, with the exception of ``std::bitset`` and
-``std::array``.
+All class/struct types declared in namespace ``std::`` having a const
+``size()`` method are considered containers, with the exception of
+``std::bitset`` and ``std::array``.
 
 Examples:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst
index 09be75c9de03a..aa2e529628c0e 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst
@@ -73,9 +73,9 @@ Suspicious usage of 'sizeof(char*)'
 
 There is a subtle difference between declaring a string literal with
 ``char* A = ""`` and ``char A[] = ""``. The first case has the type ``char*``
-instead of the aggregate type ``char[]``. Using ``sizeof`` on an object declared
-with ``char*`` type is returning the size of a pointer instead of the number of
-characters (bytes) in the string literal.
+instead of the aggregate type ``char[]``. Using ``sizeof`` on an object
+declared with ``char*`` type is returning the size of a pointer instead of
+the number of characters (bytes) in the string literal.
 
 .. code-block:: c++
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst
index c6e5608280264..56d2559be904e 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst
@@ -10,13 +10,18 @@ The ``std`` (or ``posix``) namespace is allowed to be extended with (class or
 function) template specializations that depend on an user-defined type (a type
 that is not defined in the standard system headers).
 
-The check detects the following (user provided) declarations in namespace ``std`` or ``posix``:
+The check detects the following (user provided) declarations in namespace
+``std`` or ``posix``:
 
 - Anything that is not a template specialization.
-- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument.
-- Explicit specializations of any member function of a standard library class template.
-- Explicit specializations of any member function template of a standard library class or class template.
-- Explicit or partial specialization of any member class template of a standard library class or class template.
+- Explicit specializations of any standard library function template or class
+  template, if it does not have any user-defined type as template argument.
+- Explicit specializations of any member function of a standard library class
+  template.
+- Explicit specializations of any member function template of a standard
+  library class or class template.
+- Explicit or partial specialization of any member class template of a standard
+  library class or class template.
 
 Examples:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-literal-with-embedded-nul.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-literal-with-embedded-nul.rst
index c1c4d3261dfaf..bc5f2ce2cc885 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-literal-with-embedded-nul.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/string-literal-with-embedded-nul.rst
@@ -21,9 +21,9 @@ like this ``\0x42`` where the ``\0`` stands for the NUL character.
 Truncated literal
 -----------------
 
-String-like classes can manipulate strings with embedded NUL as they are keeping
-track of the bytes and the length. This is not the case for a ``char*``
-(NUL-terminated) string.
+String-like classes can manipulate strings with embedded NUL as they are
+keeping track of the bytes and the length. This is not the case for a
+``char*`` (NUL-terminated) string.
 
 A common mistake is to pass a string-literal with embedded NUL to a string
 constructor expecting a NUL-terminated string. The bytes after the first NUL
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-enum-usage.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-enum-usage.rst
index 94f29ee11ee39..94e3db9770cbc 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-enum-usage.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-enum-usage.rst
@@ -3,11 +3,11 @@
 bugprone-suspicious-enum-usage
 ==============================
 
-The checker detects various cases when an enum is probably misused (as a bitmask
-).
+The checker detects various cases when an enum is probably misused
+(as a bitmask).
 
-1. When "ADD" or "bitwise OR" is used between two enum which come from different
-   types and these types value ranges are not disjoint.
+1. When "ADD" or "bitwise OR" is used between two enums which come
+   from different types and these types' value ranges are not disjoint.
 
 The following cases will be investigated only using :option:`StrictMode`. We
 regard the enum as a (suspicious)
@@ -17,17 +17,17 @@ bitmask if the three conditions below are true at the same time:
   short enumerations)
 * there is another non pow-of-2 number than the enum constant representing all
   choices (the result "bitwise OR" operation of all enum elements)
-* enum type variable/enumconstant is used as an argument of a `+` or "bitwise OR
-  " operator
+* enum type variable/enumconstant is used as an argument of a `+` or "bitwise
+  OR" operator
 
 So whenever the non pow-of-2 element is used as a bitmask element we diagnose a
 misuse and give a warning.
 
-2. Investigating the right hand side of `+=` and `|=` operator.
-3. Check only the enum value side of a `|` and `+` operator if one of them is not
-   enum val.
-4. Check both side of `|` or `+` operator where the enum values are from the
-   same enum type.
+2. Investigating the right hand side of ``+=`` and ``|=`` operator.
+3. Check only the enum value side of a ``|`` and ``+`` operator if one of
+   them is not enum val.
+4. Check both sides of ``|`` or ``+`` operator where the enum values are from
+   the same enum type.
 
 Examples:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-memory-comparison.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-memory-comparison.rst
index f82863f7c2f18..317f8e1839597 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-memory-comparison.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-memory-comparison.rst
@@ -24,10 +24,12 @@ and
 <https://wiki.sei.cmu.edu/confluence/display/c/FLP37-C.+Do+not+use+object+representations+to+compare+floating-point+values>`_
 
 This check is also related to and partially overlaps the CERT C++ Coding Standard rules
-`OOP57-CPP. Prefer special member functions and overloaded operators to C Standard Library functions
+`OOP57-CPP. Prefer special member functions and overloaded operators to
+C Standard Library functions
 <https://wiki.sei.cmu.edu/confluence/display/cplusplus/OOP57-CPP.+Prefer+special+member+functions+and+overloaded+operators+to+C+Standard+Library+functions>`_
 and
-`EXP62-CPP. Do not access the bits of an object representation that are not part of the object's value representation
+`EXP62-CPP. Do not access the bits of an object representation that are not
+part of the object's value representation
 <https://wiki.sei.cmu.edu/confluence/display/cplusplus/EXP62-CPP.+Do+not+access+the+bits+of+an+object+representation+that+are+not+part+of+the+object%27s+value+representation>`_
 
 `cert-exp42-c` redirects here as an alias of this check.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst
index 25a0d8885689b..9885d9c2ae9ff 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-realloc-usage.rst
@@ -3,8 +3,8 @@
 bugprone-suspicious-realloc-usage
 =================================
 
-This check finds usages of ``realloc`` where the return value is assigned to the
-same expression as passed to the first argument:
+This check finds usages of ``realloc`` where the return value is assigned to
+the same expression as passed to the first argument:
 ``p = realloc(p, size);``
 The problem with this construct is that if ``realloc`` fails it returns a
 null pointer but does not deallocate the original memory. If no other variable
@@ -12,8 +12,9 @@ is pointing to it, the original memory block is not available any more for the
 program to use or free. In either case ``p = realloc(p, size);`` indicates bad
 coding style and can be replaced by ``q = realloc(p, size);``.
 
-The pointer expression (used at ``realloc``) can be a variable or a field member
-of a data structure, but can not contain function calls or unresolved types.
+The pointer expression (used at ``realloc``) can be a variable or a field
+member of a data structure, but can not contain function calls or unresolved
+types.
 
 In obvious cases when the pointer used at realloc is assigned to another
 variable before the ``realloc`` call, no warning is emitted. This happens only
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-semicolon.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-semicolon.rst
index 76c891f3def4a..56e23d77024cb 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-semicolon.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-semicolon.rst
@@ -5,9 +5,9 @@ bugprone-suspicious-semicolon
 
 Finds most instances of stray semicolons that unexpectedly alter the meaning of
 the code. More specifically, it looks for ``if``, ``while``, ``for`` and
-``for-range`` statements whose body is a single semicolon, and then analyzes the
-context of the code (e.g. indentation) in an attempt to determine whether that
-is intentional.
+``for-range`` statements whose body is a single semicolon, and then analyzes
+the context of the code (e.g. indentation) in an attempt to determine whether
+that is intentional.
 
 .. code-block:: c++
 
@@ -26,8 +26,8 @@ of the first line, and `x` will be incremented regardless of the condition.
       processLine(line);
 
 As a result of this code, `processLine()` will only be called once, when the
-``while`` loop with the empty body exits with `line == NULL`. The indentation of
-the code indicates the intention of the programmer.
+``while`` loop with the empty body exits with ``line == NULL``. The indentation
+of the code indicates the intention of the programmer.
 
 
 .. code-block:: c++
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-string-compare.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-string-compare.rst
index 85e17967728bd..973b70393faf0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-string-compare.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-string-compare.rst
@@ -30,8 +30,8 @@ A common mistake is to compare the result to `1` or `-1`.
     if (strcmp(...) == -1)  // Incorrect usage of the returned value.
 
 Additionally, the check warns if the results value is implicitly cast to a
-*suspicious* non-integer type. It's happening when the returned value is used in
-a wrong context.
+*suspicious* non-integer type. This happens when the returned value is
+used in a wrong context.
 
 .. code-block:: c++
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/swapped-arguments.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/swapped-arguments.rst
index 674108f9d01ed..e798b67937170 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/swapped-arguments.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/swapped-arguments.rst
@@ -34,8 +34,8 @@ dealing with floating-point arguments, implicit casts between different
 floating-point types are considered acceptable.
 
 To avoid confusion, swaps where both swapped arguments are of integral types or
-both are of floating-point types do not trigger the warning. In such cases, it's
-assumed that the developer intentionally used different integral or
+both are of floating-point types do not trigger the warning. In such cases,
+it's assumed that the developer intentionally used different integral or
 floating-point types and does not raise a warning. This approach prevents false
-positives and provides flexibility in handling situations where varying integral
-or floating-point types are intentionally utilized.
+positives and provides flexibility in handling situations where varying
+integral or floating-point types are intentionally utilized.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst
index a3469dc451562..5ac5e3240d7a6 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/tagged-union-member-count.rst
@@ -53,10 +53,10 @@ How enum constants are counted
 ------------------------------
 
 The main complicating factor when counting the number of enum constants is that
-some of them might be auxiliary values that purposefully don't have a corresponding union
-data member and are used for something else. For example the last enum constant
-sometimes explicitly "points to" the last declared valid enum constant or
-tracks how many enum constants have been declared.
+some of them might be auxiliary values that purposefully don't have a
+corresponding union data member and are used for something else. For example
+the last enum constant sometimes explicitly "points to" the last declared valid
+enum constant or tracks how many enum constants have been declared.
 
 For an illustration:
 
@@ -76,23 +76,24 @@ For an illustration:
     TagCount, // is 3
   };
 
-The check counts the number of distinct values among the enum constants and not the enum
-constants themselves. This way the enum constants that are essentially just aliases of other
-enum constants are not included in the final count.
+The check counts the number of distinct values among the enum constants and not
+the enum constants themselves. This way the enum constants that are essentially
+just aliases of other enum constants are not included in the final count.
 
-Handling of counting enum constants (ones like :code:`TagCount` in the previous code example)
-is done by decreasing the number of enum values by one if the name of the last enum constant
-starts with a prefix or ends with a suffix specified in :option:`CountingEnumPrefixes`,
-:option:`CountingEnumSuffixes` and it's value is one less than the total number of distinct
-values in the enum.
+Handling of counting enum constants (ones like :code:`TagCount` in the previous
+code example) is done by decreasing the number of enum values by one if the name
+of the last enum constant starts with a prefix or ends with a suffix specified in
+:option:`CountingEnumPrefixes`, :option:`CountingEnumSuffixes` and its value is
+one less than the total number of distinct values in the enum.
 
-When the final count is adjusted based on this heuristic then a diagnostic note is emitted
-that shows which enum constant matched the criteria.
+When the final count is adjusted based on this heuristic then a diagnostic note
+is emitted that shows which enum constant matched the criteria.
 
-The heuristic can be disabled entirely (:option:`EnableCountingEnumHeuristic`) or
-configured to follow your naming convention (:option:`CountingEnumPrefixes`, :option:`CountingEnumSuffixes`).
-The strings specified in :option:`CountingEnumPrefixes`, :option:`CountingEnumSuffixes` are matched
-case insensitively.
+The heuristic can be disabled entirely (:option:`EnableCountingEnumHeuristic`)
+or configured to follow your naming convention (:option:`CountingEnumPrefixes`,
+:option:`CountingEnumSuffixes`).
+The strings specified in :option:`CountingEnumPrefixes`,
+:option:`CountingEnumSuffixes` are matched case insensitively.
 
 Example counts:
 
@@ -184,8 +185,8 @@ If :option:`EnableCountingEnumHeuristic` is `false` then these options do nothin
 The default value of :option:`CountingEnumSuffixes` is `count` and of
 :option:`CountingEnumPrefixes` is the empty string.
 
-When :option:`EnableCountingEnumHeuristic` is `true` and :option:`CountingEnumSuffixes`
-is `count;size`:
+When :option:`EnableCountingEnumHeuristic` is `true` and
+:option:`CountingEnumSuffixes` is `count;size`:
 
 .. code-block:: c++
 
@@ -223,7 +224,8 @@ is `count;size`:
     } Data;
   };
 
-When :option:`EnableCountingEnumHeuristic` is `true` and :option:`CountingEnumPrefixes` is `maxsize;last_`
+When :option:`EnableCountingEnumHeuristic` is `true` and
+:option:`CountingEnumPrefixes` is `maxsize;last_`
 
 .. code-block:: c++
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/throw-keyword-missing.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/throw-keyword-missing.rst
index 240a62ed6b33c..be80fc8a84832 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/throw-keyword-missing.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/throw-keyword-missing.rst
@@ -3,9 +3,10 @@
 bugprone-throw-keyword-missing
 ==============================
 
-Warns about a potentially missing ``throw`` keyword. If a temporary object is created, but the
-object's type derives from (or is the same as) a class that has 'EXCEPTION', 'Exception' or
-'exception' in its name, we can assume that the programmer's intention was to throw that object.
+Warns about a potentially missing ``throw`` keyword. If a temporary object
+is created, but the object's type derives from (or is the same as) a class
+that has 'EXCEPTION', 'Exception' or 'exception' in its name, we can assume
+that the programmer's intention was to throw that object.
 
 Example:
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/throwing-static-initialization.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/throwing-static-initialization.rst
index 5e320a109c39c..4f88719dd6f5c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/throwing-static-initialization.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/throwing-static-initialization.rst
@@ -11,4 +11,4 @@ References
 
 This check corresponds to the CERT C++ Coding Standard rule
 `ERR58-CPP. Handle all exceptions thrown before main() begins executing
-<https://www.securecoding.cert.org/confluence/display/cplusplus/ERR58-CPP.+Handle+all+exceptions+thrown+before+main%28%29+begins+executing>`_.
\ No newline at end of file
+<https://www.securecoding.cert.org/confluence/display/cplusplus/ERR58-CPP.+Handle+all+exceptions+thrown+before+main%28%29+begins+executing>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
index 077abf0af6880..efba0ccf97493 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst
@@ -14,10 +14,11 @@ iteration range.
     for (short i = 0; i < size; ++i) {}
   }
 
-This ``for`` loop is an infinite loop because the ``short`` type can't represent
-all values in the ``[0..size]`` interval.
+This ``for`` loop is an infinite loop because the ``short`` type can't
+represent all values in the ``[0..size]`` interval.
 
-In a real use case size means a container's size which depends on the user input.
+In a real use case size means a container's size which depends on the
+user input.
 
 .. code-block:: c++
 
@@ -29,8 +30,9 @@ This algorithm works for a small amount of objects, but will lead to freeze for
 a larger user input.
 
 It's recommended to enable the compiler warning
+`-Wtautological-constant-out-of-range-compare` as well, since the check does
-inspect compile-time constant loop boundaries to avoid overlaps with the warning.
+`-Wtautological-constant-out-of-range-compare` as well, since check does
+not inspect compile-time constant loop boundaries to avoid overlaps with
+the warning.
 
 Options
 -------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
index ebed79e339d4b..3423eaaf63eb2 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
@@ -15,9 +15,10 @@ types collectively as ``optional<T>``.
 
 An access to the value of an ``optional<T>`` occurs when one of its ``value``,
 ``operator*``, or ``operator->`` member functions is invoked.  To align with
-common misconceptions, the check considers these member functions as equivalent,
-even though there are subtle differences related to exceptions versus undefined
-behavior. See *Additional notes*, below, for more information on this topic.
+common misconceptions, the check considers these member functions as
+equivalent, even though there are subtle differences related to exceptions
+versus undefined behavior. See *Additional notes*, below, for more information
+on this topic.
 
 An access to the value of an ``optional<T>`` is considered safe if and only if
 code in the local scope (for example, a function body) ensures that the
@@ -208,8 +209,8 @@ local variable and use that variable to access the value. For example:
 Do not rely on uncommon-API invariants
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-When uncommon APIs guarantee that an optional has contents, do not rely on it --
-instead, check explicitly that the optional object has a value. For example:
+When uncommon APIs guarantee that an optional has contents, do not rely on it
+-- instead, check explicitly that the optional object has a value. For example:
 
 .. code-block:: c++
 
@@ -293,8 +294,8 @@ or terminating the program), why treat it the same as ``operator*()`` which
 causes undefined behavior (UB)? That is, why is it considered unsafe to access
 an optional with ``value()``, if it's not provably populated with a value?  For
 that matter, why is ``CHECK()`` followed by ``operator*()`` any better than
-``value()``, given that they are semantically equivalent (on configurations that
-disable exceptions)?
+``value()``, given that they are semantically equivalent (on configurations
+that disable exceptions)?
 
 The answer is that we assume most users do not realize the difference between
 ``value()`` and ``operator*()``. Shifting to ``operator*()`` and some form of
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-string-to-number-conversion.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-string-to-number-conversion.rst
index c3ea196511367..e38b29817f30a 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-string-to-number-conversion.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-string-to-number-conversion.rst
@@ -5,8 +5,8 @@ bugprone-unchecked-string-to-number-conversion
 
 This check flags calls to string-to-number conversion functions that do not
 verify the validity of the conversion, such as ``atoi()`` or ``scanf()``. It
-does not flag calls to ``strtol()``, or other, related conversion functions that
-do perform better error checking.
+does not flag calls to ``strtol()``, or other, related conversion functions
+that do perform better error checking.
 
 .. code-block:: c
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
index 3a6245d2fe35b..07c4b33048add 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst
@@ -10,9 +10,10 @@ Finds user-defined copy assignment operators which do not protect the code
 against self-assignment either by checking self-assignment explicitly or
 using the copy-and-swap or the copy-and-move method.
 
-By default, this check searches only those classes which have any pointer or C array field
-to avoid false positives. In case of a pointer or a C array, it's likely that self-copy
-assignment breaks the object if the copy assignment operator was not written with care.
+By default, this check searches only those classes which have any pointer or C
+array field to avoid false positives. In case of a pointer or a C array, it's
+likely that self-copy assignment breaks the object if the copy assignment
+operator was not written with care.
 
 See also:
 `OOP54-CPP. Gracefully handle self-copy assignment
@@ -90,9 +91,9 @@ The second one is the copy-and-swap method when we create a temporary copy
     }
   };
 
-There is a third pattern which is less common. Let's call it the copy-and-move method
-when we create a temporary copy (using the copy constructor) and then move this
-temporary object into ``this`` (needs a move assignment operator):
+There is a third pattern which is less common. Let's call it the copy-and-move
+method when we create a temporary copy (using the copy constructor) and then move
+this temporary object into ``this`` (needs a move assignment operator):
 
 .. code-block:: c++
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unintended-char-ostream-output.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unintended-char-ostream-output.rst
index 29254c4321f68..da510c472e039 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unintended-char-ostream-output.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unintended-char-ostream-output.rst
@@ -3,14 +3,14 @@
 bugprone-unintended-char-ostream-output
 =======================================
 
-Finds unintended character output from ``unsigned char`` and ``signed char`` to an
-``ostream``.
+Finds unintended character output from ``unsigned char`` and ``signed char`` to
+an ``ostream``.
 
-Normally, when ``unsigned char (uint8_t)`` or ``signed char (int8_t)`` is used, it
-is more likely a number than a character. However, when it is passed directly to
-``std::ostream``'s ``operator<<``, the result is the character output instead
-of the numeric value. This often contradicts the developer's intent to print
-integer values.
+Normally, when ``unsigned char (uint8_t)`` or ``signed char (int8_t)`` is used,
+it is more likely a number than a character. However, when it is passed
+directly to ``std::ostream``'s ``operator<<``, the result is the character
+output instead of the numeric value. This often contradicts the developer's
+intent to print integer values.
 
 .. code-block:: c++
 
@@ -26,8 +26,9 @@ intent, by default, it will cast to ``unsigned int`` for ``unsigned char`` and
   std::cout << static_cast<unsigned int>(v); // when v is unsigned char
   std::cout << static_cast<int>(v); // when v is signed char
 
-To avoid lengthy cast statements, add prefix ``+`` to the variable can also
-suppress warnings because unary expression will promote the value to an ``int``.
+To avoid lengthy cast statements, adding the prefix ``+`` to the variable can
+also suppress warnings because the unary expression will promote the value
+to an ``int``.
 
 .. code-block:: c++
 
@@ -44,11 +45,11 @@ Options
 
 .. option:: AllowedTypes
 
-  A semicolon-separated list of type names that will be treated like the ``char``
-  type: the check will not report variables declared with with these types or
-  explicit cast expressions to these types. Note that this distinguishes type
-  aliases from the original type, so specifying e.g. ``unsigned char`` here
-  will not suppress reports about ``uint8_t`` even if it is defined as a
+  A semicolon-separated list of type names that will be treated like the ``char``
+  type: the check will not report variables declared with these types or
+  explicit cast expressions to these types. Note that this distinguishes type
+  aliases from the original type, so specifying e.g. ``unsigned char`` here
+  will not suppress reports about ``uint8_t`` even if it is defined as a
   ``typedef`` alias for ``unsigned char``.
   Default is `unsigned char;signed char`.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
index 6937c5177b6c2..f1fec13739271 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unsafe-functions.rst
@@ -19,7 +19,8 @@ The check implements the following rules from the CERT C Coding Standard:
 Unsafe functions
 ----------------
 
-The following functions are reported if :option:`ReportDefaultFunctions` is enabled.
+The following functions are reported if :option:`ReportDefaultFunctions`
+is enabled.
 
 If *Annex K.* is available, a replacement from *Annex K.* is suggested for the
 following functions:
@@ -42,7 +43,8 @@ following functions from the previous list:
  - ``asctime``, ``asctime_r``, suggested replacement: ``strftime``
  - ``gets``, suggested replacement: ``fgets``
 
-The following functions are always checked, regardless of *Annex K* availability:
+The following functions are always checked, regardless of *Annex K*
+availability:
 
  - ``rewind``, suggested replacement: ``fseek``
  - ``setbuf``, suggested replacement: ``setvbuf``
@@ -80,8 +82,8 @@ including any system headers.
 Custom functions
 ----------------
 
-The option :option:`CustomFunctions` allows the user to define custom functions to be
-checked. The format is the following, without newlines:
+The option :option:`CustomFunctions` allows the user to define custom functions
+to be checked. The format is the following, without newlines:
 
 .. code::
 
@@ -94,8 +96,9 @@ checked. The format is the following, without newlines:
 The functions are matched using POSIX extended regular expressions.
 *(Note: The regular expressions do not support negative* ``(?!)`` *matches.)*
 
-The `reason` is optional and is used to provide additional information about the
-reasoning behind the replacement. The default reason is `is marked as unsafe`.
+The `reason` is optional and is used to provide additional information
+about the reasoning behind the replacement. The default reason is
+`is marked as unsafe`.
 
 If `replacement` is empty, the text `it should not be used` will be shown
 instead of the suggestion for a replacement.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst
index 10ae0fe3243a0..725403a6eb818 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst
@@ -3,7 +3,8 @@
 bugprone-unused-return-value
 ============================
 
-Warns on unused function return values. The checked functions can be configured.
+Warns on unused function return values. The checked functions can be
+configured.
 
 Operator overloading with assignment semantics are ignored.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/use-after-move.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/use-after-move.rst
index 965fc2d3c29e2..07edd07b1c4c9 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/use-after-move.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/use-after-move.rst
@@ -83,9 +83,9 @@ move:
       std::cout << str;
     }
 
-If you want to avoid the overhead of actually reinitializing the object, you can
-create a dummy function that causes the check to assume the object was
-reinitialized:
+If you want to avoid the overhead of actually reinitializing the object,
+you can create a dummy function that causes the check to assume the object
+was reinitialized:
 
 .. code-block:: c++
 
@@ -104,9 +104,9 @@ You can use this as follows:
       std::cout << str;
     }
 
-The check will not output a warning in this case because passing the object to a
-function as a non-const pointer or reference counts as a reinitialization (see section
-`Reinitialization`_ below).
+The check will not output a warning in this case because passing the object
+to a function as a non-const pointer or reference counts as a reinitialization
+(see section `Reinitialization`_ below).
 
 Unsequenced moves, uses, and reinitializations
 ----------------------------------------------
@@ -143,10 +143,10 @@ reference parameter.
 
 This means that the check will flag a use-after-move even on a type that does
 not define a move constructor or move assignment operator. This is intentional.
-Developers may use ``std::move`` on such a type in the expectation that the type
-will add move semantics in the future. If such a ``std::move`` has the potential
-to cause a use-after-move, we want to warn about it even if the type does not
-implement move semantics yet.
+Developers may use ``std::move`` on such a type in the expectation that the
+type will add move semantics in the future. If such a ``std::move`` has the
+potential to cause a use-after-move, we want to warn about it even if the type
+does not implement move semantics yet.
 
 Furthermore, if the result of ``std::move`` *is* passed to an rvalue reference
 parameter, this will always be considered to cause a move, even if the function
@@ -169,9 +169,9 @@ that a move always takes place:
 The check will assume that the last line causes a move, even though, in this
 particular case, it does not. Again, this is intentional.
 
-There is one special case: A call to ``std::move`` inside a ``try_emplace`` call
-is conservatively assumed not to move. This is to avoid spurious warnings, as
-the check has no way to reason about the ``bool`` returned by ``try_emplace``.
+There is one special case: A call to ``std::move`` inside a ``try_emplace``
+call is conservatively assumed not to move. This is to avoid spurious warnings,
+as the check has no way to reason about the ``bool`` returned by ``try_emplace``.
 
 When analyzing the order in which moves, uses and reinitializations happen (see
 section `Unsequenced moves, uses, and reinitializations`_), the move is assumed
@@ -198,8 +198,8 @@ is considered to be a use.
 An exception to this are objects of type ``std::unique_ptr``,
 ``std::shared_ptr``, ``std::weak_ptr``, ``std::optional``, and ``std::any``.
 An exception to this are objects of type ``std::unique_ptr``,
-``std::shared_ptr``, ``std::weak_ptr``, ``std::optional``, and ``std::any``, which
-can be reinitialized via ``reset``. For smart pointers specifically, the
+``std::shared_ptr``, ``std::weak_ptr``, ``std::optional``, and ``std::any``,
+which can be reinitialized via ``reset``. For smart pointers specifically, the
 moved-from objects have a well-defined state of being ``nullptr``s, and only
 ``operator*``, ``operator->`` and ``operator[]`` are considered bad accesses as
 they would be dereferencing a ``nullptr``.
@@ -217,10 +217,10 @@ The check considers a variable to be reinitialized in the following cases:
     lvalue reference. (It is assumed that the variable may be an out-parameter
     for the function.)
 
-  - ``clear()`` or ``assign()`` is called on the variable and the variable is of
-    one of the standard container types ``basic_string``, ``vector``, ``deque``,
-    ``forward_list``, ``list``, ``set``, ``map``, ``multiset``, ``multimap``,
-    ``unordered_set``, ``unordered_map``, ``unordered_multiset``,
+  - ``clear()`` or ``assign()`` is called on the variable and the variable is
+    of one of the standard container types ``basic_string``, ``vector``,
+    ``deque``, ``forward_list``, ``list``, ``set``, ``map``, ``multiset``,
+    ``multimap``, ``unordered_set``, ``unordered_map``, ``unordered_multiset``,
     ``unordered_multimap``.
 
   - ``reset()`` is called on the variable and the variable is of type
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/virtual-near-miss.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/virtual-near-miss.rst
index b3f02b839c4a4..d42268455b0c2 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/virtual-near-miss.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/virtual-near-miss.rst
@@ -3,8 +3,9 @@
 bugprone-virtual-near-miss
 ==========================
 
-Warn if a function is a near miss (i.e. the name is very similar and the function
-signature is the same) to a virtual function from a base class.
+Warn if a function is a near miss (i.e. the name is very similar and
+the function signature is the same) to a virtual function from a base
+class.
 
 Example:
 

From d2f0b27ef520f56dbe002f9136a2702feb8a625d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 10 Nov 2025 13:30:49 +0200
Subject: [PATCH 07/29] Revert "[compiler-rt] Rename the now lone
 i386/chkstk2.S to i386/chkstk.S"

This reverts commit 1f9eff100ce8faea1284d68b779d844c6e019b77.

This is done in preparation of reverting parts of
885d7b759b5c166c07c07f4c58c6e0ba110fb0c2.
---
 compiler-rt/lib/builtins/CMakeLists.txt               | 2 +-
 compiler-rt/lib/builtins/i386/{chkstk.S => chkstk2.S} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename compiler-rt/lib/builtins/i386/{chkstk.S => chkstk2.S} (100%)

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 6c226aa7d2d48..8a80b95ac31ab 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -388,7 +388,7 @@ if (NOT MSVC)
   if (WIN32)
     set(i386_SOURCES
       ${i386_SOURCES}
-      i386/chkstk.S
+      i386/chkstk2.S
     )
   endif()
 else () # MSVC
diff --git a/compiler-rt/lib/builtins/i386/chkstk.S b/compiler-rt/lib/builtins/i386/chkstk2.S
similarity index 100%
rename from compiler-rt/lib/builtins/i386/chkstk.S
rename to compiler-rt/lib/builtins/i386/chkstk2.S

From 825706be7dcc8e00c4ee62a7f78d50b65db39e68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 10 Nov 2025 13:32:58 +0200
Subject: [PATCH 08/29] Revert "[compiler-rt] [builtins] Remove unused/misnamed
 x86 chkstk functions"

This reverts parts of commit 885d7b759b5c166c07c07f4c58c6e0ba110fb0c2,
and adds verbose comments explaining all the variants of this
function, for clarity for future readers.

It turns out that those functions actually weren't misnamed or
unused after all: Apparently Clang doesn't match GCC when it comes
to what stack probe function is referenced on i386 mingw. GCC < 4.6
references a symbol named "___chkstk", with three leading underscores,
and GCC >= 4.6 references "___chkstk_ms".

Restore these functions, to allow linking object files built with
GCC with compiler-rt.
---
 compiler-rt/lib/builtins/CMakeLists.txt |  1 +
 compiler-rt/lib/builtins/i386/chkstk.S  | 40 +++++++++++++++++++++++++
 compiler-rt/lib/builtins/i386/chkstk2.S | 18 +++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 compiler-rt/lib/builtins/i386/chkstk.S

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 8a80b95ac31ab..02e6ecfbdb60e 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -388,6 +388,7 @@ if (NOT MSVC)
   if (WIN32)
     set(i386_SOURCES
       ${i386_SOURCES}
+      i386/chkstk.S
       i386/chkstk2.S
     )
   endif()
diff --git a/compiler-rt/lib/builtins/i386/chkstk.S b/compiler-rt/lib/builtins/i386/chkstk.S
new file mode 100644
index 0000000000000..8ae7d39d66aba
--- /dev/null
+++ b/compiler-rt/lib/builtins/i386/chkstk.S
@@ -0,0 +1,40 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "../assembly.h"
+
+// _chkstk routine
+// This routine is windows specific
+// http://msdn.microsoft.com/en-us/library/ms648426.aspx
+//
+// This function does not decrement %esp at the end.
+
+// GCC 4.6 and later generates calls to "___chkstk_ms". For other variants of
+// this function, which do decrement %esp, see chkstk2.S.
+
+#ifdef __i386__
+
+.text
+.balign 4
+DEFINE_COMPILERRT_FUNCTION(__chkstk_ms)
+        push   %ecx
+        push   %eax
+        cmp    $0x1000,%eax
+        lea    12(%esp),%ecx
+        jb     1f
+2:
+        sub    $0x1000,%ecx
+        test   %ecx,(%ecx)
+        sub    $0x1000,%eax
+        cmp    $0x1000,%eax
+        ja     2b
+1:
+        sub    %eax,%ecx
+        test   %ecx,(%ecx)
+        pop    %eax
+        pop    %ecx
+        ret
+END_COMPILERRT_FUNCTION(__chkstk_ms)
+
+#endif // __i386__
diff --git a/compiler-rt/lib/builtins/i386/chkstk2.S b/compiler-rt/lib/builtins/i386/chkstk2.S
index cdd9a4c2a5752..034b6edc6f1a4 100644
--- a/compiler-rt/lib/builtins/i386/chkstk2.S
+++ b/compiler-rt/lib/builtins/i386/chkstk2.S
@@ -11,9 +11,26 @@
 // This routine is windows specific
 // http://msdn.microsoft.com/en-us/library/ms648426.aspx
 
+// Clang on i386 mingw generates calls to "_alloca" (which gets decorated to
+// "__alloca").
+//
+// GCC before 4.6 generated calls to a symbol which after decoration is named
+// "___chkstk", with three leading underscores. We provide that here as well.
+//
+// MSVC produces calls to the symbol "__chkstk", with two leading underscores.
+// That one has the same signature as this one - but we don't provide that
+// symbol here. (If we'd do that, we should do it in a separate object file
+// to avoid potential symbol collisions - see
+// commit 248aeac1ad2cf4f583490dd1312a5b448d2bb8cc for details.)
+//
+// GCC 4.6 and later generates calls to "___chkstk_ms", which does not decrement
+// %esp - that function is defined in chkstk.S.
+
 .text
 .balign 4
 DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function
+// This gets decorated into "___chkstk"; GCC < 4.6 references this symbol.
+DEFINE_COMPILERRT_FUNCTION(__chkstk)
         push   %ecx
         cmp    $0x1000,%eax
         lea    8(%esp),%ecx     // esp before calling this routine -> ecx
@@ -34,6 +51,7 @@ DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function
         push   (%eax)           // push return address onto the stack
         sub    %esp,%eax        // restore the original value in eax
         ret
+END_COMPILERRT_FUNCTION(__chkstk)
 END_COMPILERRT_FUNCTION(_alloca)
 
 #endif // __i386__

From 693f700e1dba5d601d3ac999ff5471a3d0aaac70 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 13 Nov 2025 09:08:58 +0100
Subject: [PATCH 09/29] [libc++] Implement our own is{,x}digit functions for
 the C locale (#165467)

The C locale is defined by the C standard, so we know exactly which
digits classify as (x)digits. Instead of going through the locale base
API we can simply implement functions which determine whether a
character is one ourselves, and probably improve codegen significantly
as well that way.
---
 libcxx/include/__locale_dir/locale_base_api.h         |  5 -----
 libcxx/include/__locale_dir/num.h                     | 11 +++++++++--
 libcxx/include/__locale_dir/support/bsd_like.h        |  4 ----
 libcxx/include/__locale_dir/support/linux.h           |  4 ----
 .../__locale_dir/support/no_locale/characters.h       |  4 ----
 libcxx/include/__locale_dir/support/windows.h         |  4 ----
 6 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h
index 8c8f00061d1ed..fef90bb77991f 100644
--- a/libcxx/include/__locale_dir/locale_base_api.h
+++ b/libcxx/include/__locale_dir/locale_base_api.h
@@ -64,8 +64,6 @@
 // Character manipulation functions
 // --------------------------------
 // namespace __locale {
-//  int     __isdigit(int, __locale_t);  // required by the headers
-//  int     __isxdigit(int, __locale_t); // required by the headers
 //  int     __toupper(int, __locale_t);
 //  int     __tolower(int, __locale_t);
 //  int     __strcoll(const char*, const char*, __locale_t);
@@ -206,9 +204,6 @@ __strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) {
 //
 // Character manipulation functions
 //
-inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __ch, __locale_t __loc) { return isdigit_l(__ch, __loc); }
-inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __ch, __locale_t __loc) { return isxdigit_l(__ch, __loc); }
-
 #    if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __strcoll(const char* __s1, const char* __s2, __locale_t __loc) {
   return strcoll_l(__s1, __s2, __loc);
diff --git a/libcxx/include/__locale_dir/num.h b/libcxx/include/__locale_dir/num.h
index ff357cd2d97db..e31a63d040f7a 100644
--- a/libcxx/include/__locale_dir/num.h
+++ b/libcxx/include/__locale_dir/num.h
@@ -749,6 +749,13 @@ void __num_put<_CharT>::__widen_and_group_int(
     __op = __ob + (__np - __nb);
 }
 
+_LIBCPP_HIDE_FROM_ABI inline bool __isdigit(char __c) { return __c >= '0' && __c <= '9'; }
+
+_LIBCPP_HIDE_FROM_ABI inline bool __isxdigit(char __c) {
+  auto __lower = __c | 0x20;
+  return std::__isdigit(__c) || (__lower >= 'a' && __lower <= 'f');
+}
+
 template <class _CharT>
 void __num_put<_CharT>::__widen_and_group_float(
     char* __nb, char* __np, char* __ne, _CharT* __ob, _CharT*& __op, _CharT*& __oe, const locale& __loc) {
@@ -764,11 +771,11 @@ void __num_put<_CharT>::__widen_and_group_float(
     *__oe++ = __ct.widen(*__nf++);
     *__oe++ = __ct.widen(*__nf++);
     for (__ns = __nf; __ns < __ne; ++__ns)
-      if (!__locale::__isxdigit(*__ns, _LIBCPP_GET_C_LOCALE))
+      if (!std::__isxdigit(*__ns))
         break;
   } else {
     for (__ns = __nf; __ns < __ne; ++__ns)
-      if (!__locale::__isdigit(*__ns, _LIBCPP_GET_C_LOCALE))
+      if (!std::__isdigit(*__ns))
         break;
   }
   if (__grouping.empty()) {
diff --git a/libcxx/include/__locale_dir/support/bsd_like.h b/libcxx/include/__locale_dir/support/bsd_like.h
index 9d4bdd1d5775f..27735529d5524 100644
--- a/libcxx/include/__locale_dir/support/bsd_like.h
+++ b/libcxx/include/__locale_dir/support/bsd_like.h
@@ -91,10 +91,6 @@ __strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) {
 //
 // Character manipulation functions
 //
-inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t __loc) { return ::isdigit_l(__c, __loc); }
-
-inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t __loc) { return ::isxdigit_l(__c, __loc); }
-
 #if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t __loc) { return ::toupper_l(__c, __loc); }
 
diff --git a/libcxx/include/__locale_dir/support/linux.h b/libcxx/include/__locale_dir/support/linux.h
index 23bcf44c31dbf..94a2ecb9a940d 100644
--- a/libcxx/include/__locale_dir/support/linux.h
+++ b/libcxx/include/__locale_dir/support/linux.h
@@ -116,10 +116,6 @@ __strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) {
 //
 // Character manipulation functions
 //
-inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t __loc) { return isdigit_l(__c, __loc); }
-
-inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t __loc) { return isxdigit_l(__c, __loc); }
-
 #if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t __loc) { return toupper_l(__c, __loc); }
 
diff --git a/libcxx/include/__locale_dir/support/no_locale/characters.h b/libcxx/include/__locale_dir/support/no_locale/characters.h
index 1281b8bd13094..73eba3ec542c7 100644
--- a/libcxx/include/__locale_dir/support/no_locale/characters.h
+++ b/libcxx/include/__locale_dir/support/no_locale/characters.h
@@ -29,10 +29,6 @@ namespace __locale {
 //
 // Character manipulation functions
 //
-inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t) { return std::isdigit(__c); }
-
-inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t) { return std::isxdigit(__c); }
-
 #if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t) { return std::toupper(__c); }
 
diff --git a/libcxx/include/__locale_dir/support/windows.h b/libcxx/include/__locale_dir/support/windows.h
index 0df8709f118d0..edd8a66c23e80 100644
--- a/libcxx/include/__locale_dir/support/windows.h
+++ b/libcxx/include/__locale_dir/support/windows.h
@@ -197,10 +197,6 @@ __strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) {
 //
 // Character manipulation functions
 //
-inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t __loc) { return _isdigit_l(__c, __loc); }
-
-inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t __loc) { return _isxdigit_l(__c, __loc); }
-
 #if defined(_LIBCPP_BUILDING_LIBRARY)
 inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t __loc) { return ::_toupper_l(__c, __loc); }
 

From 2ac9e59d976de7f0dc4ebd2ecb7a17198b0d1ff4 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 13 Nov 2025 09:09:44 +0100
Subject: [PATCH 10/29] [libc++] Simplify the implementation of the unique_ptr
 -> shared_ptr converting constructor (#165619)

This also backports LWG2415 as a drive-by.
---
 libcxx/include/__memory/shared_ptr.h          | 41 +++----------------
 .../unique_ptr.pass.cpp                       |  4 +-
 2 files changed, 8 insertions(+), 37 deletions(-)

diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 67b94114988b5..87c9963036ca3 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -485,45 +485,16 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI shared_ptr {
 
   template <class _Yp,
             class _Dp,
-            __enable_if_t<!is_lvalue_reference<_Dp>::value && __compatible_with<_Yp, _Tp>::value &&
+            __enable_if_t<__compatible_with<_Yp, _Tp>::value &&
                               is_convertible<typename unique_ptr<_Yp, _Dp>::pointer, element_type*>::value,
                           int> = 0>
   _LIBCPP_HIDE_FROM_ABI shared_ptr(unique_ptr<_Yp, _Dp>&& __r) : __ptr_(__r.get()) {
-#if _LIBCPP_STD_VER >= 14
-    if (__ptr_ == nullptr)
-      __cntrl_ = nullptr;
-    else
-#endif
-    {
-      typedef typename __shared_ptr_default_allocator<_Yp>::type _AllocT;
-      typedef __shared_ptr_pointer<typename unique_ptr<_Yp, _Dp>::pointer, _Dp, _AllocT> _CntrlBlk;
-      __cntrl_ = new _CntrlBlk(__r.get(), std::move(__r.get_deleter()), _AllocT());
-      __enable_weak_this(__r.get(), __r.get());
-    }
-    __r.release();
-  }
+    using _AllocT   = typename __shared_ptr_default_allocator<_Yp>::type;
+    using _Deleter  = _If<is_lvalue_reference<_Dp>::value, reference_wrapper<__libcpp_remove_reference_t<_Dp> >, _Dp>;
+    using _CntrlBlk = __shared_ptr_pointer<typename unique_ptr<_Yp, _Dp>::pointer, _Deleter, _AllocT>;
 
-  template <class _Yp,
-            class _Dp,
-            class              = void,
-            __enable_if_t<is_lvalue_reference<_Dp>::value && __compatible_with<_Yp, _Tp>::value &&
-                              is_convertible<typename unique_ptr<_Yp, _Dp>::pointer, element_type*>::value,
-                          int> = 0>
-  _LIBCPP_HIDE_FROM_ABI shared_ptr(unique_ptr<_Yp, _Dp>&& __r) : __ptr_(__r.get()) {
-#if _LIBCPP_STD_VER >= 14
-    if (__ptr_ == nullptr)
-      __cntrl_ = nullptr;
-    else
-#endif
-    {
-      typedef typename __shared_ptr_default_allocator<_Yp>::type _AllocT;
-      typedef __shared_ptr_pointer<typename unique_ptr<_Yp, _Dp>::pointer,
-                                   reference_wrapper<__libcpp_remove_reference_t<_Dp> >,
-                                   _AllocT>
-          _CntrlBlk;
-      __cntrl_ = new _CntrlBlk(__r.get(), std::ref(__r.get_deleter()), _AllocT());
-      __enable_weak_this(__r.get(), __r.get());
-    }
+    __cntrl_ = __ptr_ ? new _CntrlBlk(__r.get(), std::forward<_Dp>(__r.get_deleter()), _AllocT()) : nullptr;
+    __enable_weak_this(__r.get(), __r.get());
     __r.release();
   }
 
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp
index 9308bb3858c65..f9562627fe122 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp
@@ -12,6 +12,8 @@
 
 // template <class Y, class D> shared_ptr(unique_ptr<Y, D>&&r);
 
+// XFAIL: FROZEN-CXX03-HEADERS-FIXME
+
 #include <memory>
 #include <new>
 #include <cstdlib>
@@ -165,12 +167,10 @@ int main(int, char**)
     { // LWG 2399
         fn(std::unique_ptr<int>(new int));
     }
-#if TEST_STD_VER >= 14
     { // LWG 2415
         std::unique_ptr<int, void (*)(int*)> p(nullptr, assert_deleter<int>);
         std::shared_ptr<int> p2(std::move(p)); // should not call deleter when going out of scope
     }
-#endif
 
     {
     adl::D d;

From 478e45fb94e541dfd3a53a23bbc8ed98337b8a77 Mon Sep 17 00:00:00 2001
From: Roger Sanders <sanders_roger@hotmail.com>
Date: Thu, 13 Nov 2025 19:13:41 +1100
Subject: [PATCH 11/29] [libc++] Improve performance of std::atomic_flag on
 Windows (#163524)

On Windows 8 and above, the WaitOnAddress, WakeByAddressSingle and
WakeByAddressAll functions allow efficient implementation of the C++20
wait and notify features of std::atomic_flag. libc++ has never used
these functions, so these features perform very poorly on Windows: they
are implemented as a spin loop with backoff rather than with any OS
thread signalling. This change uses these OS functions where available,
falling back to the original implementation on Windows versions prior
to 8.

Relevant API docs from Microsoft:

https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-waitonaddress

https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-wakebyaddresssingle

https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-wakebyaddressall

Fixes #127221
---
 libcxx/src/atomic.cpp | 69 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/libcxx/src/atomic.cpp b/libcxx/src/atomic.cpp
index b214ba1fd11c0..b9e4aa30bbdcd 100644
--- a/libcxx/src/atomic.cpp
+++ b/libcxx/src/atomic.cpp
@@ -41,6 +41,11 @@
 // OpenBSD has no indirect syscalls
 #  define _LIBCPP_FUTEX(...) futex(__VA_ARGS__)
 
+#elif defined(_WIN32)
+
+#  include <memory>
+#  include <windows.h>
+
 #else // <- Add other operating systems here
 
 // Baseline needs no new headers
@@ -101,6 +106,70 @@ static void __libcpp_platform_wake_by_address(__cxx_atomic_contention_t const vo
   _umtx_op(const_cast<__cxx_atomic_contention_t*>(__ptr), UMTX_OP_WAKE, __notify_one ? 1 : INT_MAX, nullptr, nullptr);
 }
 
+#elif defined(_WIN32)
+
+static void* win32_get_synch_api_function(const char* function_name) {
+  // Attempt to locate the API set. Note that as per the Microsoft STL implementation, we assume this API is already
+  // loaded and accessible. While this isn't explicitly guaranteed by publicly available Win32 API documentation, it is
+  // true in practice, and may be guaranteed by internal documentation not released publicly. In any case the fact that
+  // the Microsoft STL made this assumption is a reasonable basis to say we can too. The alternative to this would be
+  // to use LoadLibrary, but then leak the module handle. We can't call FreeLibrary, as this would have to be triggered
+  // by a global static destructor, which would hang off DllMain, and calling FreeLibrary from DllMain is explicitly
+  // mentioned as not being allowed:
+  // https://learn.microsoft.com/en-us/windows/win32/dlls/dllmain
+  // Given the range of bad options here, we have chosen to mirror what Microsoft did, as it seems fair to assume that
+  // Microsoft will guarantee compatibility for us, as we are exposed to the same conditions as all existing Windows
+  // apps using the Microsoft STL VS2015/2017/2019/2022 runtimes, where Windows 7 support has not been excluded at
+  // compile time.
+  static auto module_handle = GetModuleHandleW(L"api-ms-win-core-synch-l1-2-0.dll");
+  if (module_handle == nullptr) {
+    return nullptr;
+  }
+
+  // Attempt to locate the function in the API and return the result to the caller. Note that the NULL return from
+  // GetProcAddress is documented as being interchangeable with nullptr.
+  // https://devblogs.microsoft.com/oldnewthing/20180307-00/?p=98175
+  return reinterpret_cast<void*>(GetProcAddress(module_handle, function_name));
+}
+
+static void
+__libcpp_platform_wait_on_address(__cxx_atomic_contention_t const volatile* __ptr, __cxx_contention_t __val) {
+  // WaitOnAddress was added in Windows 8 (build 9200)
+  static auto wait_on_address = reinterpret_cast<BOOL(WINAPI*)(volatile void*, PVOID, SIZE_T, DWORD)>(
+      win32_get_synch_api_function("WaitOnAddress"));
+  if (wait_on_address != nullptr) {
+    wait_on_address(const_cast<__cxx_atomic_contention_t*>(__ptr), &__val, sizeof(__val), INFINITE);
+  } else {
+    __libcpp_thread_poll_with_backoff(
+        [=]() -> bool { return !__cxx_nonatomic_compare_equal(__cxx_atomic_load(__ptr, memory_order_relaxed), __val); },
+        __libcpp_timed_backoff_policy());
+  }
+}
+
+static void __libcpp_platform_wake_by_address(__cxx_atomic_contention_t const volatile* __ptr, bool __notify_one) {
+  if (__notify_one) {
+    // WakeByAddressSingle was added in Windows 8 (build 9200)
+    static auto wake_by_address_single =
+        reinterpret_cast<void(WINAPI*)(PVOID)>(win32_get_synch_api_function("WakeByAddressSingle"));
+    if (wake_by_address_single != nullptr) {
+      wake_by_address_single(const_cast<__cxx_atomic_contention_t*>(__ptr));
+    } else {
+      // The fallback implementation of waking does nothing, as the fallback wait implementation just does polling, so
+      // there's nothing to do here.
+    }
+  } else {
+    // WakeByAddressAll was added in Windows 8 (build 9200)
+    static auto wake_by_address_all =
+        reinterpret_cast<void(WINAPI*)(PVOID)>(win32_get_synch_api_function("WakeByAddressAll"));
+    if (wake_by_address_all != nullptr) {
+      wake_by_address_all(const_cast<__cxx_atomic_contention_t*>(__ptr));
+    } else {
+      // The fallback implementation of waking does nothing, as the fallback wait implementation just does polling, so
+      // there's nothing to do here.
+    }
+  }
+}
+
 #else // <- Add other operating systems here
 
 // Baseline is just a timed backoff

From 189d1853e484bab6e4f62bc8c2b509c5f020d795 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 13 Nov 2025 09:28:48 +0100
Subject: [PATCH 12/29] [libc++] Add an initial modulemap for the test support
 headers (#162800)

This should improve the time it takes to run the test suite a bit. Right
now there are only a handful of headers in the modulemap because we're
missing a lot of includes in the tests. New headers should be added
there from the start, and we should fill up the modulemap over time
until it contains all the test support headers.
---
 libcxx/include/module.modulemap.in                  |  5 ++++-
 .../copy_move_unwrap_reverse.pass.cpp               |  1 +
 libcxx/test/libcxx/memory/allocation_guard.pass.cpp |  2 ++
 .../memory/uninitialized_allocator_copy.pass.cpp    |  1 +
 .../alg.contains/ranges.contains_subrange.pass.cpp  |  1 +
 .../alg.nonmodifying/alg.count/count.pass.cpp       |  1 +
 .../alg.count/ranges.count.pass.cpp                 |  1 +
 .../alg.ends_with/ranges.ends_with.pass.cpp         |  2 ++
 .../alg.starts_with/ranges.starts_with.pass.cpp     |  1 +
 .../alg.partitions/pstl.is_partitioned.pass.cpp     |  1 +
 .../associative/map/map.cons/dtor_noexcept.pass.cpp |  3 ++-
 .../multimap/multimap.cons/dtor_noexcept.pass.cpp   |  3 ++-
 .../multiset/multiset.cons/dtor_noexcept.pass.cpp   |  3 ++-
 .../associative/set/set.cons/dtor_noexcept.pass.cpp |  3 ++-
 .../flat.map/flat.map.cons/copy_assign.pass.cpp     |  1 +
 .../flat.map/flat.map.cons/dtor_noexcept.pass.cpp   |  1 +
 .../container.adaptors/flat.map/helpers.h           |  1 +
 .../flat.multimap.cons/copy_assign.pass.cpp         |  1 +
 .../flat.multimap.cons/dtor_noexcept.pass.cpp       |  1 +
 .../container.adaptors/flat.multimap/helpers.h      |  1 +
 .../flat.multiset.cons/compare.pass.cpp             |  1 +
 .../flat.multiset.cons/copy_assign.pass.cpp         |  2 ++
 .../flat.multiset.cons/dtor_noexcept.pass.cpp       |  1 +
 .../flat.set/flat.set.cons/copy_assign.pass.cpp     |  1 +
 .../flat.set/flat.set.cons/dtor_noexcept.pass.cpp   |  1 +
 .../stack/stack.cons/ctor_iterators.pass.cpp        |  1 +
 .../allocator_move.pass.cpp                         |  1 +
 .../deque/deque.cons/dtor_noexcept.pass.cpp         |  3 ++-
 .../forwardlist.cons/dtor_noexcept.pass.cpp         |  3 ++-
 .../sequences/list/list.cons/dtor_noexcept.pass.cpp |  3 ++-
 .../sequences/vector.bool/assign_move.pass.cpp      |  1 +
 .../sequences/vector.bool/default_noexcept.pass.cpp |  3 ++-
 .../sequences/vector.bool/dtor_noexcept.pass.cpp    |  3 ++-
 .../vector.bool/move_assign_noexcept.pass.cpp       |  3 ++-
 .../sequences/vector.bool/move_noexcept.pass.cpp    |  3 ++-
 .../vector/vector.cons/dtor_noexcept.pass.cpp       |  3 ++-
 .../unord.map.cnstr/dtor_noexcept.pass.cpp          |  3 ++-
 .../unord.multimap.cnstr/dtor_noexcept.pass.cpp     |  3 ++-
 .../unord.multiset.cnstr/dtor_noexcept.pass.cpp     |  3 ++-
 .../unord.set.cnstr/dtor_noexcept.pass.cpp          |  3 ++-
 .../class.path/path.member/path.append.pass.cpp     |  1 +
 .../class.path/path.member/path.concat.pass.cpp     |  1 +
 libcxx/test/std/numerics/c.math/signbit.pass.cpp    |  1 +
 .../range.iota.view/indices.pass.cpp                |  3 +++
 .../re/re.results/re.results.const/move.pass.cpp    |  4 +++-
 .../strings/basic.string/string.cons/dtor.pass.cpp  |  5 +++--
 .../string.cons/iter_alloc_deduction.pass.cpp       |  1 +
 .../string.cons/string_view_deduction.pass.cpp      | 13 +++++++------
 .../string_view_size_size_deduction.pass.cpp        | 11 ++++++-----
 .../make_obj_using_allocator.pass.cpp               |  1 +
 ...uninitialized_construct_using_allocator.pass.cpp |  1 +
 .../uses_allocator_construction_args.pass.cpp       |  2 ++
 libcxx/test/support/module.modulemap                | 10 ++++++++++
 53 files changed, 102 insertions(+), 31 deletions(-)
 create mode 100644 libcxx/test/support/module.modulemap

diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 7ca57f6455dd8..ff9eb8c98a3e3 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -2361,7 +2361,10 @@ module std [system] {
   module hash_table           { header "__hash_table" }
   module node_handle          { header "__node_handle" }
   module split_buffer         { header "__split_buffer" }
-  module tree                 { header "__tree" }
+  module tree                 {
+    header "__tree"
+    export std.memory.unique_ptr
+  }
   module std_mbstate_t {
     header "__std_mbstate_t.h"
     export *
diff --git a/libcxx/test/libcxx/algorithms/alg.modifying.operations/copy_move_unwrap_reverse.pass.cpp b/libcxx/test/libcxx/algorithms/alg.modifying.operations/copy_move_unwrap_reverse.pass.cpp
index 2a85e7b5ddcc3..22444624dcba7 100644
--- a/libcxx/test/libcxx/algorithms/alg.modifying.operations/copy_move_unwrap_reverse.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.modifying.operations/copy_move_unwrap_reverse.pass.cpp
@@ -19,6 +19,7 @@
 #include <cstdint>
 #include <iterator>
 #include <type_traits>
+#include <utility>
 
 #include "test_iterators.h"
 
diff --git a/libcxx/test/libcxx/memory/allocation_guard.pass.cpp b/libcxx/test/libcxx/memory/allocation_guard.pass.cpp
index 493ebf044187c..a7c93972d656d 100644
--- a/libcxx/test/libcxx/memory/allocation_guard.pass.cpp
+++ b/libcxx/test/libcxx/memory/allocation_guard.pass.cpp
@@ -17,6 +17,8 @@
 
 #include <__memory/allocation_guard.h>
 #include <cassert>
+#include <climits>
+#include <memory>
 #include <type_traits>
 #include <utility>
 
diff --git a/libcxx/test/libcxx/memory/uninitialized_allocator_copy.pass.cpp b/libcxx/test/libcxx/memory/uninitialized_allocator_copy.pass.cpp
index 679ee86844687..1d127f947c1da 100644
--- a/libcxx/test/libcxx/memory/uninitialized_allocator_copy.pass.cpp
+++ b/libcxx/test/libcxx/memory/uninitialized_allocator_copy.pass.cpp
@@ -11,6 +11,7 @@
 // ensure that __uninitialized_allocator_copy calls the proper construct and destruct functions
 
 #include <algorithm>
+#include <cassert>
 #include <iterator>
 #include <memory>
 
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
index 890ac23fff832..8354894493e21 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
@@ -32,6 +32,7 @@
 
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
+#include "type_algorithms.h"
 
 struct NotEqualityComparable {};
 
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
index ffe3e0ef746c1..1561dcf8a6352 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
@@ -19,6 +19,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <vector>
 
 #include "sized_allocator.h"
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
index 2c1346a743746..9d9d7ed5ab177 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
@@ -26,6 +26,7 @@
 #include <array>
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <ranges>
 #include <vector>
 
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.ends_with/ranges.ends_with.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.ends_with/ranges.ends_with.pass.cpp
index 199e6a786e5ba..76c62ffa760ad 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.ends_with/ranges.ends_with.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.ends_with/ranges.ends_with.pass.cpp
@@ -25,8 +25,10 @@
 #include <array>
 #include <chrono>
 #include <ranges>
+
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
+#include "type_algorithms.h"
 
 using namespace std::chrono;
 
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.starts_with/ranges.starts_with.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.starts_with/ranges.starts_with.pass.cpp
index 0f2284edde81c..172fa82fccc29 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.starts_with/ranges.starts_with.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.starts_with/ranges.starts_with.pass.cpp
@@ -27,6 +27,7 @@
 
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
+#include "type_algorithms.h"
 
 template <class Iter1, class Sent1 = Iter1, class Iter2 = int*, class Sent2 = Iter2>
 concept HasStartsWithIt = requires(Iter1 first1, Sent1 last1, Iter2 first2, Sent2 last2) {
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.partitions/pstl.is_partitioned.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.partitions/pstl.is_partitioned.pass.cpp
index a80e2f6ddc637..b64242dfc14b6 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.partitions/pstl.is_partitioned.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.partitions/pstl.is_partitioned.pass.cpp
@@ -20,6 +20,7 @@
 
 #include "test_iterators.h"
 #include "test_execution_policies.h"
+#include "type_algorithms.h"
 
 template <class Iter>
 struct Test {
diff --git a/libcxx/test/std/containers/associative/map/map.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/associative/map/map.cons/dtor_noexcept.pass.cpp
index 8497b942ad8eb..c9f7f281391f5 100644
--- a/libcxx/test/std/containers/associative/map/map.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.cons/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <map>
 #include <cassert>
+#include <map>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.cons/dtor_noexcept.pass.cpp
index 62afae92b6902..c757befb9211c 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.cons/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <map>
 #include <cassert>
+#include <map>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/associative/multiset/multiset.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/associative/multiset/multiset.cons/dtor_noexcept.pass.cpp
index 987eca0706076..bf5d256e19a0f 100644
--- a/libcxx/test/std/containers/associative/multiset/multiset.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/associative/multiset/multiset.cons/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <set>
 #include <cassert>
+#include <set>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/associative/set/set.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/associative/set/set.cons/dtor_noexcept.pass.cpp
index 63c0433477414..a382ce629d284 100644
--- a/libcxx/test/std/containers/associative/set/set.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/associative/set/set.cons/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <set>
 #include <cassert>
+#include <set>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_assign.pass.cpp
index 8aa2e7bc539fd..c98803f6cce9d 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/copy_assign.pass.cpp
@@ -12,6 +12,7 @@
 
 // flat_map& operator=(const flat_map& m);
 
+#include <cassert>
 #include <deque>
 #include <flat_map>
 #include <functional>
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp
index 4562b01bc8c42..9e8198443387b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp
@@ -17,6 +17,7 @@
 #include <flat_map>
 #include <functional>
 #include <vector>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h
index 932f330db829e..445de4fcb6eae 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h
@@ -15,6 +15,7 @@
 #include <vector>
 #include <flat_map>
 #include <ranges>
+#include <type_traits>
 
 #include "../flat_helpers.h"
 #include "test_allocator.h"
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp
index fd57a1061b615..ed040d5a3625a 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp
@@ -12,6 +12,7 @@
 
 // flat_multimap& operator=(const flat_multimap& m);
 
+#include <cassert>
 #include <deque>
 #include <flat_map>
 #include <functional>
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp
index 104d56755bd76..2534a4748aa76 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp
@@ -16,6 +16,7 @@
 #include <deque>
 #include <flat_map>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "test_macros.h"
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h
index f3edd3b3a0242..ccb3218ebfc41 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h
@@ -15,6 +15,7 @@
 #include <vector>
 #include <flat_map>
 #include <ranges>
+#include <type_traits>
 
 #include "../flat_helpers.h"
 #include "test_allocator.h"
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
index 43ebea740f66c..81f9bbcfb1199 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
@@ -14,6 +14,7 @@
 // template <class Alloc>
 //   flat_multiset(const key_compare& comp, const Alloc& a);
 
+#include <cassert>
 #include <deque>
 #include <flat_set>
 #include <functional>
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
index 2e63a004ffa88..de297f1f8f92a 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
@@ -13,9 +13,11 @@
 // flat_multiset& operator=(const flat_multiset& m);
 
 #include <algorithm>
+#include <cassert>
 #include <deque>
 #include <flat_set>
 #include <functional>
+#include <utility>
 #include <vector>
 
 #include "operator_hijacker.h"
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
index f7243fa7e7fb3..0df06672a6ba3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
@@ -16,6 +16,7 @@
 #include <deque>
 #include <flat_set>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "test_macros.h"
diff --git a/libcxx/test/std/containers/container.adaptors/flat.set/flat.set.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.set/flat.set.cons/copy_assign.pass.cpp
index 59caa9c1f8a48..33fe457d7c1e3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.set/flat.set.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.set/flat.set.cons/copy_assign.pass.cpp
@@ -13,6 +13,7 @@
 // flat_set& operator=(const flat_set& m);
 
 #include <algorithm>
+#include <cassert>
 #include <deque>
 #include <flat_set>
 #include <functional>
diff --git a/libcxx/test/std/containers/container.adaptors/flat.set/flat.set.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.set/flat.set.cons/dtor_noexcept.pass.cpp
index 810b13b0a5b0b..1caf58fff72d3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.set/flat.set.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.set/flat.set.cons/dtor_noexcept.pass.cpp
@@ -16,6 +16,7 @@
 #include <deque>
 #include <flat_set>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "test_macros.h"
diff --git a/libcxx/test/std/containers/container.adaptors/stack/stack.cons/ctor_iterators.pass.cpp b/libcxx/test/std/containers/container.adaptors/stack/stack.cons/ctor_iterators.pass.cpp
index 5981f9189dbfb..afd5a63895b80 100644
--- a/libcxx/test/std/containers/container.adaptors/stack/stack.cons/ctor_iterators.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/stack/stack.cons/ctor_iterators.pass.cpp
@@ -15,6 +15,7 @@
 
 #include <cassert>
 #include <stack>
+#include <type_traits>
 
 #include "test_allocator.h"
 
diff --git a/libcxx/test/std/containers/container.requirements/container.requirements.general/allocator_move.pass.cpp b/libcxx/test/std/containers/container.requirements/container.requirements.general/allocator_move.pass.cpp
index 5de5579d13067..e927bfc83aea4 100644
--- a/libcxx/test/std/containers/container.requirements/container.requirements.general/allocator_move.pass.cpp
+++ b/libcxx/test/std/containers/container.requirements/container.requirements.general/allocator_move.pass.cpp
@@ -13,6 +13,7 @@
 //   belonging to the container being moved. Such move construction of the
 //   allocator shall not exit via an exception.
 
+#include <cassert>
 #include <vector>
 #include <deque>
 #include <list>
diff --git a/libcxx/test/std/containers/sequences/deque/deque.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/deque/deque.cons/dtor_noexcept.pass.cpp
index f0a839484f9dc..338a0fa6b832d 100644
--- a/libcxx/test/std/containers/sequences/deque/deque.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/deque/deque.cons/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <deque>
 #include <cassert>
+#include <deque>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/dtor_noexcept.pass.cpp
index 64c60af1fdcb7..8d474cf966339 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <forward_list>
 #include <cassert>
+#include <forward_list>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/sequences/list/list.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/list/list.cons/dtor_noexcept.pass.cpp
index 44e6ddd722a70..4ebe19fafef29 100644
--- a/libcxx/test/std/containers/sequences/list/list.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/list.cons/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <list>
 #include <cassert>
+#include <list>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/sequences/vector.bool/assign_move.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/assign_move.pass.cpp
index 8791380b134c7..2200367fb27f7 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/assign_move.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/assign_move.pass.cpp
@@ -14,6 +14,7 @@
 // vector& operator=(vector&& c);
 
 #include <cassert>
+#include <utility>
 #include <vector>
 
 #include "min_allocator.h"
diff --git a/libcxx/test/std/containers/sequences/vector.bool/default_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/default_noexcept.pass.cpp
index 0801709625090..a95d358248f56 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/default_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/default_noexcept.pass.cpp
@@ -16,8 +16,9 @@
 // For vector<>, this was added to the standard by N4258,
 //   but vector<bool> was not changed.
 
-#include <vector>
 #include <cassert>
+#include <vector>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "test_allocator.h"
diff --git a/libcxx/test/std/containers/sequences/vector.bool/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/dtor_noexcept.pass.cpp
index f8f3c76f8c3d9..6a6ca6b36ce5d 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <vector>
 #include <cassert>
+#include <vector>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "test_allocator.h"
diff --git a/libcxx/test/std/containers/sequences/vector.bool/move_assign_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/move_assign_noexcept.pass.cpp
index 5a69213c3b4ef..e5add73b167d0 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/move_assign_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/move_assign_noexcept.pass.cpp
@@ -17,8 +17,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <vector>
 #include <cassert>
+#include <vector>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "test_allocator.h"
diff --git a/libcxx/test/std/containers/sequences/vector.bool/move_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/move_noexcept.pass.cpp
index d0d231f48953a..5bdae01f871f9 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/move_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/move_noexcept.pass.cpp
@@ -15,8 +15,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <vector>
 #include <cassert>
+#include <vector>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "test_allocator.h"
diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/dtor_noexcept.pass.cpp
index 4a7e2cf445676..331b360c2fa1d 100644
--- a/libcxx/test/std/containers/sequences/vector/vector.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector/vector.cons/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <vector>
 #include <cassert>
+#include <vector>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/dtor_noexcept.pass.cpp
index 5797599b1f9e4..cf148fd75732b 100644
--- a/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <unordered_map>
 #include <cassert>
+#include <unordered_map>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/dtor_noexcept.pass.cpp
index 6fdc9e3eb7f5e..2771e64b36ea8 100644
--- a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <unordered_map>
 #include <cassert>
+#include <unordered_map>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/dtor_noexcept.pass.cpp
index 32c757e16d344..c48c2865b844f 100644
--- a/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <unordered_set>
 #include <cassert>
+#include <unordered_set>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/dtor_noexcept.pass.cpp
index 17cfae0f989c7..2939e36dd6e0d 100644
--- a/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/dtor_noexcept.pass.cpp
@@ -12,8 +12,9 @@
 
 // UNSUPPORTED: c++03
 
-#include <unordered_set>
 #include <cassert>
+#include <unordered_set>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "MoveOnly.h"
diff --git a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp
index b3d87ee630873..65c4dabe47d66 100644
--- a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp
@@ -32,6 +32,7 @@
 #include <type_traits>
 #include <string_view>
 #include <cassert>
+#include <utility>
 
 // On Windows, the append function converts all inputs (pointers, iterators)
 // to an intermediate path object, causing allocations in cases where no
diff --git a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp
index 96de72b355f3f..d4a32735dc05a 100644
--- a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp
@@ -39,6 +39,7 @@
 #include <string>
 #include <string_view>
 #include <cassert>
+#include <utility>
 
 // On Windows, charset conversions cause allocations in the path class in
 // cases where no allocations are done on other platforms.
diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp
index 233e8ed2338b6..5655370e60bb0 100644
--- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp
@@ -20,6 +20,7 @@
 #include <cassert>
 #include <cmath>
 #include <limits>
+#include <type_traits>
 
 #include "test_macros.h"
 #include "type_algorithms.h"
diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/indices.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/indices.pass.cpp
index d92b6cb876a40..b70471b25d32b 100644
--- a/libcxx/test/std/ranges/range.factories/range.iota.view/indices.pass.cpp
+++ b/libcxx/test/std/ranges/range.factories/range.iota.view/indices.pass.cpp
@@ -12,6 +12,9 @@
 
 // inline constexpr unspecified indices = unspecified;
 
+// FIXME: This test shouldn't define TEST_HAS_NO_INT128
+// ADDITIONAL_COMPILE_FLAGS(clang-modules-build): -fno-modules
+
 #include <cassert>
 #include <cstddef>
 #include <ranges>
diff --git a/libcxx/test/std/re/re.results/re.results.const/move.pass.cpp b/libcxx/test/std/re/re.results/re.results.const/move.pass.cpp
index 0806edef1429b..9078d575d531c 100644
--- a/libcxx/test/std/re/re.results/re.results.const/move.pass.cpp
+++ b/libcxx/test/std/re/re.results/re.results.const/move.pass.cpp
@@ -14,8 +14,10 @@
 //
 //  Additionally, the stored Allocator value is move constructed from m.get_allocator().
 
-#include <regex>
 #include <cassert>
+#include <regex>
+#include <utility>
+
 #include "test_macros.h"
 #include "test_allocator.h"
 
diff --git a/libcxx/test/std/strings/basic.string/string.cons/dtor.pass.cpp b/libcxx/test/std/strings/basic.string/string.cons/dtor.pass.cpp
index e9f174068473d..d5c1f4d9348b8 100644
--- a/libcxx/test/std/strings/basic.string/string.cons/dtor.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.cons/dtor.pass.cpp
@@ -12,11 +12,12 @@
 
 // ~basic_string() // implied noexcept; // constexpr since C++20
 
-#include <string>
 #include <cassert>
+#include <string>
+#include <type_traits>
 
-#include "test_macros.h"
 #include "test_allocator.h"
+#include "test_macros.h"
 
 template <class T>
 struct throwing_alloc {
diff --git a/libcxx/test/std/strings/basic.string/string.cons/iter_alloc_deduction.pass.cpp b/libcxx/test/std/strings/basic.string/string.cons/iter_alloc_deduction.pass.cpp
index d9176da63d0dc..d47c9dfded226 100644
--- a/libcxx/test/std/strings/basic.string/string.cons/iter_alloc_deduction.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.cons/iter_alloc_deduction.pass.cpp
@@ -25,6 +25,7 @@
 #include <iterator>
 #include <string>
 #include <type_traits>
+#include <utility>
 
 #include "test_macros.h"
 #include "test_allocator.h"
diff --git a/libcxx/test/std/strings/basic.string/string.cons/string_view_deduction.pass.cpp b/libcxx/test/std/strings/basic.string/string.cons/string_view_deduction.pass.cpp
index 3a6f84f2699ac..a19564b98b283 100644
--- a/libcxx/test/std/strings/basic.string/string.cons/string_view_deduction.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.cons/string_view_deduction.pass.cpp
@@ -19,17 +19,18 @@
 //  The deduction guide shall not participate in overload resolution if Allocator
 //  is a type that does not qualify as an allocator.
 
-#include <string>
-#include <string_view>
+#include <cassert>
+#include <cstddef>
 #include <iterator>
 #include <memory>
+#include <string>
+#include <string_view>
 #include <type_traits>
-#include <cassert>
-#include <cstddef>
+#include <utility>
 
-#include "test_macros.h"
-#include "test_allocator.h"
 #include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
 
 template <class StringView, class Allocator, class = void>
 struct CanDeduce : std::false_type {};
diff --git a/libcxx/test/std/strings/basic.string/string.cons/string_view_size_size_deduction.pass.cpp b/libcxx/test/std/strings/basic.string/string.cons/string_view_size_size_deduction.pass.cpp
index 08e696b7a091b..e36503d92be18 100644
--- a/libcxx/test/std/strings/basic.string/string.cons/string_view_size_size_deduction.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.cons/string_view_size_size_deduction.pass.cpp
@@ -25,15 +25,16 @@
 //  The deduction guide shall not participate in overload resolution if Allocator
 //  is a type that does not qualify as an allocator.
 
-#include <string>
-#include <string_view>
-#include <iterator>
 #include <cassert>
 #include <cstddef>
+#include <iterator>
+#include <string>
+#include <string_view>
+#include <utility>
 
-#include "test_macros.h"
-#include "test_allocator.h"
 #include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
 
 template <class StringView, class Size, class Allocator, class = void>
 struct CanDeduce : std::false_type {};
diff --git a/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/make_obj_using_allocator.pass.cpp b/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/make_obj_using_allocator.pass.cpp
index 1f605140b4def..744e530191cdc 100644
--- a/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/make_obj_using_allocator.pass.cpp
+++ b/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/make_obj_using_allocator.pass.cpp
@@ -14,6 +14,7 @@
 // test_memory_resource requires RTTI for dynamic_cast
 // UNSUPPORTED: no-rtti
 
+#include <cassert>
 #include <concepts>
 #include <memory>
 #include <tuple>
diff --git a/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uninitialized_construct_using_allocator.pass.cpp b/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uninitialized_construct_using_allocator.pass.cpp
index 07260e540d957..329698c4371c0 100644
--- a/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uninitialized_construct_using_allocator.pass.cpp
+++ b/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uninitialized_construct_using_allocator.pass.cpp
@@ -14,6 +14,7 @@
 // test_memory_resource requires RTTI for dynamic_cast
 // UNSUPPORTED: no-rtti
 
+#include <cassert>
 #include <concepts>
 #include <memory>
 #include <tuple>
diff --git a/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uses_allocator_construction_args.pass.cpp b/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uses_allocator_construction_args.pass.cpp
index 93a445d7c9e06..aa3a5e8a28685 100644
--- a/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uses_allocator_construction_args.pass.cpp
+++ b/libcxx/test/std/utilities/memory/allocator.uses/allocator.uses.construction/uses_allocator_construction_args.pass.cpp
@@ -14,10 +14,12 @@
 // test_memory_resource requires RTTI for dynamic_cast
 // UNSUPPORTED: no-rtti
 
+#include <cassert>
 #include <concepts>
 #include <memory>
 #include <ranges>
 #include <tuple>
+#include <type_traits>
 #include <utility>
 
 #include "common.h"
diff --git a/libcxx/test/support/module.modulemap b/libcxx/test/support/module.modulemap
new file mode 100644
index 0000000000000..75ae8c4221ae0
--- /dev/null
+++ b/libcxx/test/support/module.modulemap
@@ -0,0 +1,10 @@
+
+module test_config {
+  module test_macros { textual header "test_macros.h" }
+}
+
+module test {
+  module double_move_tracker    { header "double_move_tracker.h" }
+  module test_allocator         { header "test_allocator.h" }
+  module type_algorithms        { header "type_algorithms.h" }
+}

From f038dfd22d111303d928af4cbcad37498c020c4e Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 13 Nov 2025 09:29:28 +0100
Subject: [PATCH 13/29] [libc++] Merge is_{,un}bounded_array.h into is_array.h
 (#167479)

These headers are incredibly simple and closely related, so this merges
them into a single one.
---
 libcxx/include/CMakeLists.txt                 |  2 -
 libcxx/include/__memory/shared_ptr.h          |  2 -
 .../__memory/uninitialized_algorithms.h       |  1 -
 libcxx/include/__memory/unique_ptr.h          |  2 -
 libcxx/include/__type_traits/is_array.h       | 26 +++++++++++++
 .../include/__type_traits/is_bounded_array.h  | 36 ------------------
 .../__type_traits/is_unbounded_array.h        | 38 -------------------
 libcxx/include/module.modulemap.in            |  8 ----
 libcxx/include/optional                       |  1 -
 libcxx/include/type_traits                    |  2 -
 10 files changed, 26 insertions(+), 92 deletions(-)
 delete mode 100644 libcxx/include/__type_traits/is_bounded_array.h
 delete mode 100644 libcxx/include/__type_traits/is_unbounded_array.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 131ba99357d62..4b2713191c1c0 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -822,7 +822,6 @@ set(files
   __type_traits/is_array.h
   __type_traits/is_assignable.h
   __type_traits/is_base_of.h
-  __type_traits/is_bounded_array.h
   __type_traits/is_callable.h
   __type_traits/is_char_like_type.h
   __type_traits/is_class.h
@@ -872,7 +871,6 @@ set(files
   __type_traits/is_trivially_destructible.h
   __type_traits/is_trivially_lexicographically_comparable.h
   __type_traits/is_trivially_relocatable.h
-  __type_traits/is_unbounded_array.h
   __type_traits/is_union.h
   __type_traits/is_unqualified.h
   __type_traits/is_unsigned.h
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 87c9963036ca3..ee07efe081e51 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -41,13 +41,11 @@
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_array.h>
-#include <__type_traits/is_bounded_array.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
 #include <__type_traits/is_function.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_same.h>
-#include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/nat.h>
 #include <__type_traits/negation.h>
 #include <__type_traits/remove_cv.h>
diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h
index 34d065dc973e5..9182db4b412e3 100644
--- a/libcxx/include/__memory/uninitialized_algorithms.h
+++ b/libcxx/include/__memory/uninitialized_algorithms.h
@@ -32,7 +32,6 @@
 #include <__type_traits/is_trivially_assignable.h>
 #include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_relocatable.h>
-#include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/remove_const.h>
 #include <__type_traits/remove_extent.h>
 #include <__utility/exception_guard.h>
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h
index 491d1c2e42417..3cf4b97a7f49c 100644
--- a/libcxx/include/__memory/unique_ptr.h
+++ b/libcxx/include/__memory/unique_ptr.h
@@ -32,7 +32,6 @@
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_assignable.h>
-#include <__type_traits/is_bounded_array.h>
 #include <__type_traits/is_constant_evaluated.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
@@ -42,7 +41,6 @@
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
 #include <__type_traits/is_trivially_relocatable.h>
-#include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/remove_extent.h>
 #include <__type_traits/type_identity.h>
diff --git a/libcxx/include/__type_traits/is_array.h b/libcxx/include/__type_traits/is_array.h
index e734d1a3043ee..62dd378cec79b 100644
--- a/libcxx/include/__type_traits/is_array.h
+++ b/libcxx/include/__type_traits/is_array.h
@@ -26,6 +26,32 @@ template <class _Tp>
 _LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_array_v = __is_array(_Tp);
 #endif
 
+template <class _Tp>
+inline const bool __is_bounded_array_v = __is_bounded_array(_Tp);
+
+#if _LIBCPP_STD_VER >= 20
+
+template <class _Tp>
+struct _LIBCPP_NO_SPECIALIZATIONS is_bounded_array : bool_constant<__is_bounded_array(_Tp)> {};
+
+template <class _Tp>
+_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_bounded_array_v = __is_bounded_array(_Tp);
+
+#endif
+
+template <class _Tp>
+inline const bool __is_unbounded_array_v = __is_unbounded_array(_Tp);
+
+#if _LIBCPP_STD_VER >= 20
+
+template <class _Tp>
+struct _LIBCPP_NO_SPECIALIZATIONS is_unbounded_array : bool_constant<__is_unbounded_array(_Tp)> {};
+
+template <class _Tp>
+_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_unbounded_array_v = __is_unbounded_array(_Tp);
+
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_ARRAY_H
diff --git a/libcxx/include/__type_traits/is_bounded_array.h b/libcxx/include/__type_traits/is_bounded_array.h
deleted file mode 100644
index 8a41e07aa019b..0000000000000
--- a/libcxx/include/__type_traits/is_bounded_array.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_BOUNDED_ARRAY_H
-#define _LIBCPP___TYPE_TRAITS_IS_BOUNDED_ARRAY_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-inline const bool __is_bounded_array_v = __is_bounded_array(_Tp);
-
-#if _LIBCPP_STD_VER >= 20
-
-template <class _Tp>
-struct _LIBCPP_NO_SPECIALIZATIONS is_bounded_array : bool_constant<__is_bounded_array(_Tp)> {};
-
-template <class _Tp>
-_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_bounded_array_v = __is_bounded_array(_Tp);
-
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_BOUNDED_ARRAY_H
diff --git a/libcxx/include/__type_traits/is_unbounded_array.h b/libcxx/include/__type_traits/is_unbounded_array.h
deleted file mode 100644
index e14809e26a787..0000000000000
--- a/libcxx/include/__type_traits/is_unbounded_array.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_UNBOUNDED_ARRAY_H
-#define _LIBCPP___TYPE_TRAITS_IS_UNBOUNDED_ARRAY_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class>
-inline const bool __is_unbounded_array_v = false;
-template <class _Tp>
-inline const bool __is_unbounded_array_v<_Tp[]> = true;
-
-#if _LIBCPP_STD_VER >= 20
-
-template <class _Tp>
-struct _LIBCPP_NO_SPECIALIZATIONS is_unbounded_array : bool_constant<__is_unbounded_array_v<_Tp>> {};
-
-template <class _Tp>
-_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_unbounded_array_v = __is_unbounded_array_v<_Tp>;
-
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_UNBOUNDED_ARRAY_H
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index ff9eb8c98a3e3..57d66cd1ccaef 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -126,10 +126,6 @@ module std_core [system] {
       header "__type_traits/is_base_of.h"
       export std_core.type_traits.integral_constant
     }
-    module is_bounded_array {
-      header "__type_traits/is_bounded_array.h"
-      export std_core.type_traits.integral_constant
-    }
     module is_callable {
       header "__type_traits/is_callable.h"
       export std_core.type_traits.integral_constant
@@ -323,10 +319,6 @@ module std_core [system] {
       header "__type_traits/is_trivially_relocatable.h"
       export std_core.type_traits.integral_constant
     }
-    module is_unbounded_array {
-      header "__type_traits/is_unbounded_array.h"
-      export std_core.type_traits.integral_constant
-    }
     module is_union {
       header "__type_traits/is_union.h"
       export std_core.type_traits.integral_constant
diff --git a/libcxx/include/optional b/libcxx/include/optional
index ad672f6a9914f..23b21364b1a79 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -238,7 +238,6 @@ namespace std {
 #  include <__type_traits/is_trivially_constructible.h>
 #  include <__type_traits/is_trivially_destructible.h>
 #  include <__type_traits/is_trivially_relocatable.h>
-#  include <__type_traits/is_unbounded_array.h>
 #  include <__type_traits/negation.h>
 #  include <__type_traits/reference_constructs_from_temporary.h>
 #  include <__type_traits/remove_const.h>
diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits
index dab0c0640c389..f3e397e4df80c 100644
--- a/libcxx/include/type_traits
+++ b/libcxx/include/type_traits
@@ -550,9 +550,7 @@ namespace std
 
 #  if _LIBCPP_STD_VER >= 20
 #    include <__type_traits/common_reference.h>
-#    include <__type_traits/is_bounded_array.h>
 #    include <__type_traits/is_constant_evaluated.h>
-#    include <__type_traits/is_unbounded_array.h>
 #    include <__type_traits/type_identity.h>
 #    include <__type_traits/unwrap_ref.h>
 #  endif

From d2a2b169f6cdfc2864f83ebc221dc771a9a1697d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Thu, 13 Nov 2025 08:57:19 +0000
Subject: [PATCH 14/29] [libunwind] Ensure zaDisable() is called in
 jumpto/returnto (NFC) (#167674)

This is an NFC for now, as the SME checks for macOS platforms are not
implemented, so zaDisable() is a no-op, but both paths for resuming from
an exception should disable ZA.

This is a fixup for a recent change in #165066.
---
 libunwind/src/Registers.hpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/libunwind/src/Registers.hpp b/libunwind/src/Registers.hpp
index 28649fafb23d5..45a2b0921ea3b 100644
--- a/libunwind/src/Registers.hpp
+++ b/libunwind/src/Registers.hpp
@@ -1862,16 +1862,13 @@ class _LIBUNWIND_HIDDEN Registers_arm64 {
   v128        getVectorRegister(int num) const;
   void        setVectorRegister(int num, v128 value);
   static const char *getRegisterName(int num);
+  void        jumpto(unsigned walkedFrames = 0) {
+    zaDisable();
+    __libunwind_Registers_arm64_jumpto(this, walkedFrames);
+  }
 #ifdef _LIBUNWIND_TRACE_RET_INJECT
   _LIBUNWIND_TRACE_NO_INLINE
-    void      returnto(unsigned walkedFrames) {
-      __libunwind_Registers_arm64_jumpto(this, walkedFrames);
-    }
-#else
-  void        jumpto() {
-      zaDisable();
-      __libunwind_Registers_arm64_jumpto(this, 0);
-  }
+  void        returnto(unsigned walkedFrames) { jumpto(walkedFrames); }
 #endif
   static constexpr int lastDwarfRegNum() {
     return _LIBUNWIND_HIGHEST_DWARF_REGISTER_ARM64;

From 70eb4b0a6093bd0c5e03d6ff0f715b4b258ffa96 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Thu, 13 Nov 2025 09:58:43 +0100
Subject: [PATCH 15/29] [clang][bytecode] Fix diagnosing subtraction of
 zero-size pointers (#167839)

We need to get the element type size at bytecode generation time to
check. We also need to diagnose this in the LHS == RHS case.
---
 clang/lib/AST/ByteCode/Compiler.cpp    |  9 +++++++-
 clang/lib/AST/ByteCode/Interp.h        | 32 ++++++++++++--------------
 clang/lib/AST/ByteCode/Opcodes.td      |  1 +
 clang/test/AST/ByteCode/arrays.cpp     |  4 ++++
 clang/test/AST/ByteCode/new-delete.cpp | 10 ++++++++
 5 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 1243380ca8a6b..dfc6250afedd7 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -1039,8 +1039,15 @@ bool Compiler<Emitter>::VisitPointerArithBinOp(const BinaryOperator *E) {
     if (!visitAsPointer(RHS, *RT) || !visitAsPointer(LHS, *LT))
       return false;
 
+    QualType ElemType = LHS->getType()->getPointeeType();
+    CharUnits ElemTypeSize;
+    if (ElemType->isVoidType() || ElemType->isFunctionType())
+      ElemTypeSize = CharUnits::One();
+    else
+      ElemTypeSize = Ctx.getASTContext().getTypeSizeInChars(ElemType);
+
     PrimType IntT = classifyPrim(E->getType());
-    if (!this->emitSubPtr(IntT, E))
+    if (!this->emitSubPtr(IntT, ElemTypeSize.isZero(), E))
       return false;
     return DiscardResult ? this->emitPop(IntT, E) : true;
   }
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index cbd60c9f2b37c..19c8d6d850339 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -2390,7 +2390,7 @@ static inline bool DecPtr(InterpState &S, CodePtr OpPC) {
 /// 2) Pops another Pointer from the stack.
 /// 3) Pushes the difference of the indices of the two pointers on the stack.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
-inline bool SubPtr(InterpState &S, CodePtr OpPC) {
+inline bool SubPtr(InterpState &S, CodePtr OpPC, bool ElemSizeIsZero) {
   const Pointer &LHS = S.Stk.pop<Pointer>();
   const Pointer &RHS = S.Stk.pop<Pointer>();
 
@@ -2402,25 +2402,23 @@ inline bool SubPtr(InterpState &S, CodePtr OpPC) {
     return false;
   }
 
-  if (LHS == RHS) {
-    S.Stk.push<T>();
-    return true;
-  }
+  if (ElemSizeIsZero) {
+    QualType PtrT = LHS.getType();
+    while (auto *AT = dyn_cast<ArrayType>(PtrT))
+      PtrT = AT->getElementType();
 
-  for (const Pointer &P : {LHS, RHS}) {
-    if (P.isZeroSizeArray()) {
-      QualType PtrT = P.getType();
-      while (auto *AT = dyn_cast<ArrayType>(PtrT))
-        PtrT = AT->getElementType();
+    QualType ArrayTy = S.getASTContext().getConstantArrayType(
+        PtrT, APInt::getZero(1), nullptr, ArraySizeModifier::Normal, 0);
+    S.FFDiag(S.Current->getSource(OpPC),
+             diag::note_constexpr_pointer_subtraction_zero_size)
+        << ArrayTy;
 
-      QualType ArrayTy = S.getASTContext().getConstantArrayType(
-          PtrT, APInt::getZero(1), nullptr, ArraySizeModifier::Normal, 0);
-      S.FFDiag(S.Current->getSource(OpPC),
-               diag::note_constexpr_pointer_subtraction_zero_size)
-          << ArrayTy;
+    return false;
+  }
 
-      return false;
-    }
+  if (LHS == RHS) {
+    S.Stk.push<T>();
+    return true;
   }
 
   int64_t A64 =
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 1785fcf4a7b20..ddf1a8fcc98b1 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -543,6 +543,7 @@ def SubOffset : AluOpcode;
 // [Pointer, Pointer] -> [Integral]
 def SubPtr : Opcode {
   let Types = [IntegerTypeClass];
+  let Args = [ArgBool];
   let HasGroup = 1;
 }
 
diff --git a/clang/test/AST/ByteCode/arrays.cpp b/clang/test/AST/ByteCode/arrays.cpp
index eaf9559e40cda..d83ae97fc8213 100644
--- a/clang/test/AST/ByteCode/arrays.cpp
+++ b/clang/test/AST/ByteCode/arrays.cpp
@@ -731,6 +731,10 @@ namespace ZeroSizeTypes {
                              // both-note {{subtraction of pointers to type 'int[0]' of zero size}} \
                              // both-warning {{subtraction of pointers to type 'int[0]' of zero size has undefined behavior}}
 
+  constexpr int k2 = p1 - p1; // both-error {{constexpr variable 'k2' must be initialized by a constant expression}} \
+                              // both-note {{subtraction of pointers to type 'int[0]' of zero size}} \
+                              // both-warning {{subtraction of pointers to type 'int[0]' of zero size has undefined behavior}}
+
   int arr[5][0];
   constexpr int f() { // both-error {{never produces a constant expression}}
     return &arr[3] - &arr[0]; // both-note {{subtraction of pointers to type 'int[0]' of zero size}} \
diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp
index f54854070573c..9e0f33e212c18 100644
--- a/clang/test/AST/ByteCode/new-delete.cpp
+++ b/clang/test/AST/ByteCode/new-delete.cpp
@@ -1104,6 +1104,16 @@ namespace HugeAllocation {
 }
 #endif
 
+namespace ZeroSizeArray {
+  constexpr int foo() {
+    int *A = new int[0];
+    int diff = A - (&A[0]);
+    delete[] A;
+    return diff;
+  }
+  static_assert(foo() == 0);
+}
+
 #else
 /// Make sure we reject this prior to C++20
 constexpr int a() { // both-error {{never produces a constant expression}}

From f2ed002192da1a1f5001a0a015d491574bc53bfb Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Thu, 13 Nov 2025 17:11:19 +0800
Subject: [PATCH 16/29] [RISCV] Fix RISCVInsertVSETVLI coalescing clobbering VL
 def segment (#167712)

This fixes an assert when compiling llvm-test-suite with -march=rva23u64
-O3 that started appearing sometime this week.

We get "Cannot overlap two segments with differing ValID's" because we
try to coalesce these two vsetvlis:

    %x:gprnox0 = COPY $x8
    dead $x0 = PseudoVSETIVLI 1, 208, implicit-def $vl, implicit-def $vtype
    %y:gprnox0 = COPY %x
    %v:vr = COPY $v8, implicit $vtype
    %x = PseudoVSETVLI %x, 208, implicit-def $vl, implicit-def $vtype

    -->

    %x:gprnox0 = COPY $x8
    %x = PseudoVSETVLI %x, 208, implicit-def $vl, implicit-def $vtype
    %y:gprnox0 = COPY %x
    %v:vr = COPY $v8, implicit $vtype

However, doing so would extend the segment of the new value of %x up to
the first vsetvli, where it would overlap the segment of the old value of
%x, which is still live until the COPY into %y.

This fixes it by checking that it's safe to extend the segment, by simply
making sure the interval isn't live at the first vsetvli.

This unfortunately causes a regression in the existing
coalesce_vl_avl_same_reg test because even though we could coalesce the
vsetvlis there, we now bail. I couldn't think of an easy way to handle
this safely, but I don't think this is an important case to handle:
After testing this patch on SPEC CPU 2017 there are no codegen changes.
---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp  |  8 ++++
 llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll | 43 +++++++++++++++++++
 .../test/CodeGen/RISCV/rvv/vsetvli-insert.mir | 27 ++++++++++++
 3 files changed, 78 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index bf9de0a4b5604..e5819d90526d9 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1755,6 +1755,14 @@ bool RISCVInsertVSETVLI::canMutatePriorConfig(
       if (!VNI || !PrevVNI || VNI != PrevVNI)
         return false;
     }
+
+    // If we define VL and need to move the definition up, check we can extend
+    // the live interval upwards from MI to PrevMI.
+    Register VL = MI.getOperand(0).getReg();
+    if (VL.isVirtual() && LIS &&
+        LIS->getInterval(VL).overlaps(LIS->getInstructionIndex(PrevMI),
+                                      LIS->getInstructionIndex(MI)))
+      return false;
   }
 
   assert(PrevMI.getOperand(2).isImm() && MI.getOperand(2).isImm());
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index b6e29cf76cd48..e8d89d4066e43 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -879,3 +879,46 @@ entry:
   %3 = tail call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff(<vscale x 8 x i8> poison, ptr %p, i64 %2)
   ret void
 }
+
+; This will create a live interval in such a way we can't coalesce two vsetvlis,
+; see the corresponding .mir test for more details. Make sure we check for this
+; and don't crash.
+define void @coalesce_vl_clobber(ptr %p) {
+; CHECK-LABEL: coalesce_vl_clobber:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a2, 0
+; CHECK-NEXT:    li a1, 0
+; CHECK-NEXT:    vsetivli zero, 0, e8, mf2, ta, ma
+; CHECK-NEXT:    vmclr.m v8
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:  .LBB43_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    slli a3, a1, 32
+; CHECK-NEXT:    vsetvli a1, a2, e8, mf8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 0, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    srli a3, a3, 32
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vslideup.vx v10, v9, a3, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vse32.v v10, (a0), v0.t
+; CHECK-NEXT:    li a2, 1
+; CHECK-NEXT:    j .LBB43_1
+entry:
+  br label %vector.body
+
+vector.body:
+  %avl = phi i64 [ 0, %entry ], [ 1, %vector.body ]
+  %prev.evl = phi i32 [ 0, %entry ], [ %0, %vector.body ]
+  %0 = tail call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 1, i1 true)
+  %1 = tail call <vscale x 4 x i1> @llvm.experimental.vp.splice(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 0, <vscale x 4 x i1> zeroinitializer, i32 %prev.evl, i32 0)
+  tail call void @llvm.vp.store(<vscale x 4 x float> zeroinitializer, ptr %p, <vscale x 4 x i1> %1, i32 %0)
+  br label %vector.body
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index f9929c9caf712..396ca517e4017 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -100,6 +100,10 @@
     ret void
   }
 
+  define void @coalesce_vl_clobber() {
+    ret void
+  }
+
   define void @vsetvli_vleff() {
     ret void
   }
@@ -624,10 +628,33 @@ body: |
     ; CHECK: liveins: $x8, $v8
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %x:gprnox0 = COPY $x8
+    ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 1, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead %v:vr = COPY $v8, implicit $vtype
     ; CHECK-NEXT: dead %x:gprnox0 = PseudoVSETVLI %x, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    %x:gprnox0 = COPY $x8
+    dead $x0 = PseudoVSETIVLI 1, 208, implicit-def $vl, implicit-def $vtype
+    %v:vr = COPY $v8, implicit $vtype
+    %x = PseudoVSETVLI %x, 208, implicit-def $vl, implicit-def $vtype
+...
+---
+# Because of the %y:gprnox0 = COPY %x, we can't extend the live range of %x from
+# the second vsetvli to the first vsetvli when coalescing.
+name: coalesce_vl_clobber
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x8, $v8
+    ; CHECK-LABEL: name: coalesce_vl_clobber
+    ; CHECK: liveins: $x8, $v8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %x:gprnox0 = COPY $x8
+    ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 1, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead %y:gprnox0 = COPY %x
     ; CHECK-NEXT: dead %v:vr = COPY $v8, implicit $vtype
+    ; CHECK-NEXT: dead %x:gprnox0 = PseudoVSETVLI %x, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
     %x:gprnox0 = COPY $x8
     dead $x0 = PseudoVSETIVLI 1, 208, implicit-def $vl, implicit-def $vtype
+    %y:gprnox0 = COPY %x
     %v:vr = COPY $v8, implicit $vtype
     %x = PseudoVSETVLI %x, 208, implicit-def $vl, implicit-def $vtype
 ...

From 295a3f725153cb1e8e0b257c6409d2e12b904d0e Mon Sep 17 00:00:00 2001
From: Ryan Mansfield <ryan_mansfield@apple.com>
Date: Thu, 13 Nov 2025 04:31:37 -0500
Subject: [PATCH 17/29]  [yaml2obj][MachO] Fix crash from integer underflow
 with invalid cmdsize (#165924)

yaml2obj would crash when processing Mach-O load commands with cmdsize
smaller than the actual structure size e.g. LC_SEGMENT_64 with
cmdsize=56 instead of 72. The crash occurred due to integer underflow
when calculating padding: cmdsize - BytesWritten wraps to a large value
when negative, causing a massive allocation attempt.
---
 llvm/lib/ObjectYAML/MachOEmitter.cpp          | 37 ++++++++++++-
 .../MachO/load-cmdsize-too-small.yaml         | 55 +++++++++++++++++++
 2 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/ObjectYAML/MachO/load-cmdsize-too-small.yaml

diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp
index 35d442e8e3437..46c91811d0a67 100644
--- a/llvm/lib/ObjectYAML/MachOEmitter.cpp
+++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp
@@ -19,14 +19,26 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/SystemZ/zOSSupport.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
 namespace {
 
+static const char *getLoadCommandName(uint32_t cmd) {
+  switch (cmd) {
+#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
+  case MachO::LCName:                                                          \
+    return #LCName;
+#include "llvm/BinaryFormat/MachO.def"
+  default:
+    return nullptr;
+  }
+}
+
 class MachOWriter {
 public:
   MachOWriter(MachOYAML::Object &Obj) : Obj(Obj), fileStart(0) {
@@ -244,7 +256,8 @@ void MachOWriter::ZeroToOffset(raw_ostream &OS, size_t Offset) {
 }
 
 void MachOWriter::writeLoadCommands(raw_ostream &OS) {
-  for (auto &LC : Obj.LoadCommands) {
+  for (size_t i = 0; i < Obj.LoadCommands.size(); ++i) {
+    auto &LC = Obj.LoadCommands[i];
     size_t BytesWritten = 0;
     llvm::MachO::macho_load_command Data = LC.Data;
 
@@ -285,7 +298,25 @@ void MachOWriter::writeLoadCommands(raw_ostream &OS) {
 
     // Fill remaining bytes with 0. This will only get hit in partially
     // specified test cases.
-    auto BytesRemaining = LC.Data.load_command_data.cmdsize - BytesWritten;
+    // Prevent integer underflow if BytesWritten exceeds cmdsize.
+    if (BytesWritten > LC.Data.load_command_data.cmdsize) {
+      std::string Name;
+      const char *NameCStr = getLoadCommandName(LC.Data.load_command_data.cmd);
+      if (NameCStr)
+        Name = NameCStr;
+      else
+        Name = ("(0x" + Twine::utohexstr(LC.Data.load_command_data.cmd) + ")")
+                   .str();
+
+      WithColor::warning() << "load command " << i << " " << Name
+                           << " cmdsize too small ("
+                           << LC.Data.load_command_data.cmdsize
+                           << " bytes) for actual size (" << BytesWritten
+                           << " bytes)\n";
+    }
+    auto BytesRemaining = (BytesWritten < LC.Data.load_command_data.cmdsize)
+                              ? LC.Data.load_command_data.cmdsize - BytesWritten
+                              : 0;
     if (BytesRemaining > 0) {
       ZeroFillBytes(OS, BytesRemaining);
     }
diff --git a/llvm/test/ObjectYAML/MachO/load-cmdsize-too-small.yaml b/llvm/test/ObjectYAML/MachO/load-cmdsize-too-small.yaml
new file mode 100644
index 0000000000000..ef11711e0487e
--- /dev/null
+++ b/llvm/test/ObjectYAML/MachO/load-cmdsize-too-small.yaml
@@ -0,0 +1,55 @@
+## Test that yaml2obj handles load commands with cmdsize smaller than the
+## actual structure size without crashing (due to integer underflow).
+
+## Test with a known load command (LC_SEGMENT_64).
+# RUN: yaml2obj %s --docnum=1 -o %t1 2>&1 | FileCheck %s --check-prefix=WARNING-KNOWN
+# RUN: not llvm-readobj --file-headers %t1 2>&1 | FileCheck %s --check-prefix=MALFORMED
+
+# WARNING-KNOWN: warning: load command 0 LC_SEGMENT_64 cmdsize too small (56 bytes) for actual size (72 bytes)
+
+# MALFORMED: error: {{.*}}: truncated or malformed object (load command 0 LC_SEGMENT_64 cmdsize too small)
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x00000001
+  ncmds:           1
+  sizeofcmds:      56
+  flags:           0x00002000
+  reserved:        0x00000000
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         56          ## Should be 72 for LC_SEGMENT_64
+    segname:         '__TEXT'
+    vmaddr:          0x1000
+    vmsize:          0x10
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        5
+    nsects:          0
+    flags:           0
+...
+
+## Test with an unknown load command value.
+# RUN: yaml2obj %s --docnum=2 -o %t2 2>&1 | FileCheck %s --check-prefix=WARNING-UNKNOWN
+
+# WARNING-UNKNOWN: warning: load command 0 (0xdeadbeef) cmdsize too small (8 bytes) for actual size (20 bytes)
+
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x00000001
+  ncmds:           1
+  sizeofcmds:      20
+  flags:           0x00002000
+  reserved:        0x00000000
+LoadCommands:
+  - cmd:             0xDEADBEEF
+    cmdsize:         8
+    PayloadBytes:    [0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C]
+...

From 4d10c1165442cbbbc0017b48fcdd7dae1ccf3678 Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
Date: Thu, 13 Nov 2025 10:32:03 +0100
Subject: [PATCH 18/29] Reland [MS][clang] Add support for vector deleting
 destructors (#165598)

MSVC supports an extension that allows deleting an array of objects via a
pointer whose static type doesn't match its dynamic type. This is done
via generation of special destructors - vector deleting destructors.
MSVC's virtual tables always contain a pointer to the vector deleting
destructor for classes with virtual destructors, so not having this
extension implemented causes clang to generate code that is not
compatible with the code generated by MSVC, because clang always puts a
pointer to a scalar deleting destructor to the vtable. As a bonus the
deletion of an array of polymorphic objects will work just like it does
with MSVC - no memory leaks and correct destructors are called.

This patch will cause clang to emit code that is compatible with code
produced by MSVC but not compatible with code produced with clang of
older versions, so the new behavior can be disabled via passing
-fclang-abi-compat=21 (or lower).

This is yet another attempt to land vector deleting destructors support
originally implemented by
https://github.com/llvm/llvm-project/pull/133451.

This PR contains fixes for issues reported in the original PR as well as
fixes for issues related to operator delete[] search reported in several
issues like

https://github.com/llvm/llvm-project/pull/133950#issuecomment-2787510484
https://github.com/llvm/llvm-project/issues/134265

Fixes https://github.com/llvm/llvm-project/issues/19772
---
 clang/docs/ReleaseNotes.rst                   |   8 +
 clang/include/clang/AST/ASTContext.h          |  22 ++
 clang/include/clang/AST/ASTMutationListener.h |   9 +
 clang/include/clang/AST/DeclCXX.h             |  16 +-
 clang/include/clang/AST/VTableBuilder.h       |   6 +-
 clang/include/clang/Basic/ABI.h               |  11 +-
 clang/include/clang/Basic/TargetInfo.h        |   5 +
 clang/include/clang/Sema/Sema.h               |   3 +-
 clang/include/clang/Serialization/ASTWriter.h |   4 +
 clang/lib/AST/ASTContext.cpp                  |  65 ++++
 clang/lib/AST/DeclCXX.cpp                     |  73 +++-
 clang/lib/AST/Expr.cpp                        |   3 +
 clang/lib/AST/ItaniumMangle.cpp               |   2 +
 clang/lib/AST/MicrosoftMangle.cpp             |  22 +-
 clang/lib/AST/VTableBuilder.cpp               |  18 +-
 clang/lib/Basic/TargetInfo.cpp                |   7 +
 clang/lib/CodeGen/CGCXX.cpp                   |  37 +-
 clang/lib/CodeGen/CGCXXABI.cpp                |  14 +
 clang/lib/CodeGen/CGCXXABI.h                  |   6 +
 clang/lib/CodeGen/CGClass.cpp                 |  95 ++++-
 clang/lib/CodeGen/CGDebugInfo.cpp             |   8 +-
 clang/lib/CodeGen/CGExprCXX.cpp               |  52 ++-
 clang/lib/CodeGen/CGVTables.cpp               |   4 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |  50 +++
 clang/lib/CodeGen/CodeGenModule.h             |   6 +
 clang/lib/CodeGen/ItaniumCXXABI.cpp           |   5 +-
 clang/lib/CodeGen/MicrosoftCXXABI.cpp         |  70 +++-
 clang/lib/Sema/SemaDeclCXX.cpp                |  30 +-
 clang/lib/Sema/SemaExprCXX.cpp                |  13 +-
 clang/lib/Serialization/ASTCommon.h           |   4 +-
 clang/lib/Serialization/ASTReaderDecl.cpp     |  66 +++-
 clang/lib/Serialization/ASTWriter.cpp         |  36 ++
 clang/lib/Serialization/ASTWriterDecl.cpp     |   2 +
 clang/test/CodeGenCXX/dllexport.cpp           |   5 +-
 .../microsoft-abi-extern-template.cpp         |   2 +-
 .../CodeGenCXX/microsoft-abi-structors.cpp    |   2 +-
 .../test/CodeGenCXX/microsoft-abi-thunks.cpp  |   3 +-
 .../CodeGenCXX/microsoft-abi-vftables.cpp     |  20 +-
 .../microsoft-abi-virtual-inheritance.cpp     |  17 +-
 ...multiple-nonvirtual-inheritance-vdtors.cpp |  18 +-
 .../microsoft-abi-vtables-return-thunks.cpp   |   2 +-
 ...crosoft-abi-vtables-single-inheritance.cpp |  20 +-
 ...-vtables-virtual-inheritance-vtordisps.cpp |  30 +-
 ...rosoft-abi-vtables-virtual-inheritance.cpp |  18 +-
 .../CodeGenCXX/microsoft-no-rtti-data.cpp     |   2 +-
 .../microsoft-vector-deleting-dtors.cpp       | 336 ++++++++++++++++++
 clang/test/CodeGenCXX/vtable-consteval.cpp    |   4 +-
 clang/test/DebugInfo/CXX/windows-dtor.cpp     |   2 +-
 .../module.modulemap                          |   1 +
 .../msvc-vector-deleting-dtors.h              |  16 +
 .../msvc-vector-deleting-destructors.cpp      |  30 ++
 clang/test/Modules/vtable-windows.cppm        |   2 +-
 .../PCH/Inputs/msvc-vector-deleting-dtors.h   |  16 +
 .../PCH/msvc-vector-deleting-destructors.cpp  |  34 ++
 clang/test/Profile/cxx-abc-deleting-dtor.cpp  |   9 +-
 clang/test/SemaCXX/gh134265.cpp               |  62 ++++
 .../SymbolFile/DWARF/SymbolFileDWARF.cpp      |   1 +
 57 files changed, 1260 insertions(+), 164 deletions(-)
 create mode 100644 clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp
 create mode 100644 clang/test/Modules/Inputs/msvc-vector-deleting-dtors/module.modulemap
 create mode 100644 clang/test/Modules/Inputs/msvc-vector-deleting-dtors/msvc-vector-deleting-dtors.h
 create mode 100644 clang/test/Modules/msvc-vector-deleting-destructors.cpp
 create mode 100644 clang/test/PCH/Inputs/msvc-vector-deleting-dtors.h
 create mode 100644 clang/test/PCH/msvc-vector-deleting-destructors.cpp
 create mode 100644 clang/test/SemaCXX/gh134265.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 88a05affebf9e..b3273e39a6279 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -82,6 +82,12 @@ Potentially Breaking Changes
 - Downstream projects that previously linked only against ``clangDriver`` may
   now (also) need to link against the new ``clangOptions`` library, since
   options-related code has been moved out of the Driver into a separate library.
+- Clang now supports MSVC vector deleting destructors when targeting Windows.
+  This means that vtables of classes with virtual destructors will contain a
+  pointer to the vector deleting destructor (instead of the scalar deleting
+  destructor), which is a different symbol with a different name and linkage.
+  This may cause runtime failures if two binaries using the same class defining
+  a virtual destructor are compiled with different versions of clang.
 
 C/C++ Language Potentially Breaking Changes
 -------------------------------------------
@@ -588,6 +594,8 @@ Android Support
 Windows Support
 ^^^^^^^^^^^^^^^
 
+- Clang now supports MSVC vector deleting destructors (GH19772).
+
 LoongArch Support
 ^^^^^^^^^^^^^^^^^
 - Enable linker relaxation by default for loongarch64.
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 33aa2d343aa7a..6e9e737dcae4f 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -370,6 +370,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
   mutable llvm::DenseSet<const FunctionDecl *> DestroyingOperatorDeletes;
   mutable llvm::DenseSet<const FunctionDecl *> TypeAwareOperatorNewAndDeletes;
 
+  /// Global and array operators delete are only required for MSVC deleting
+  /// destructors support. Store them here to avoid keeping 4 pointers that are
+  /// not always used in each redeclaration of the destructor.
+  mutable llvm::DenseMap<const CXXDestructorDecl *, FunctionDecl *>
+      OperatorDeletesForVirtualDtor;
+  mutable llvm::DenseMap<const CXXDestructorDecl *, FunctionDecl *>
+      GlobalOperatorDeletesForVirtualDtor;
+  mutable llvm::DenseMap<const CXXDestructorDecl *, FunctionDecl *>
+      ArrayOperatorDeletesForVirtualDtor;
+  mutable llvm::DenseMap<const CXXDestructorDecl *, FunctionDecl *>
+      GlobalArrayOperatorDeletesForVirtualDtor;
+
   /// The next string literal "version" to allocate during constant evaluation.
   /// This is used to distinguish between repeated evaluations of the same
   /// string literal.
@@ -3473,6 +3485,16 @@ class ASTContext : public RefCountedBase<ASTContext> {
                                          bool IsTypeAware);
   bool isTypeAwareOperatorNewOrDelete(const FunctionDecl *FD) const;
 
+  enum OperatorDeleteKind { Regular, GlobalRegular, Array, ArrayGlobal };
+
+  void addOperatorDeleteForVDtor(const CXXDestructorDecl *Dtor,
+                                 FunctionDecl *OperatorDelete,
+                                 OperatorDeleteKind K) const;
+  FunctionDecl *getOperatorDeleteForVDtor(const CXXDestructorDecl *Dtor,
+                                          OperatorDeleteKind K) const;
+  bool dtorHasOperatorDelete(const CXXDestructorDecl *Dtor,
+                             OperatorDeleteKind K) const;
+
   /// Retrieve the context for computing mangling numbers in the given
   /// DeclContext.
   MangleNumberingContext &getManglingNumberContext(const DeclContext *DC);
diff --git a/clang/include/clang/AST/ASTMutationListener.h b/clang/include/clang/AST/ASTMutationListener.h
index 352af42391782..c8448a25c23a4 100644
--- a/clang/include/clang/AST/ASTMutationListener.h
+++ b/clang/include/clang/AST/ASTMutationListener.h
@@ -90,6 +90,15 @@ class ASTMutationListener {
   virtual void ResolvedOperatorGlobDelete(const CXXDestructorDecl *DD,
                                           const FunctionDecl *GlobDelete) {}
 
+  /// A virtual destructor's operator array delete has been resolved.
+  virtual void ResolvedOperatorArrayDelete(const CXXDestructorDecl *DD,
+                                           const FunctionDecl *ArrayDelete) {}
+
+  /// A virtual destructor's operator global array delete has been resolved.
+  virtual void
+  ResolvedOperatorGlobArrayDelete(const CXXDestructorDecl *DD,
+                                  const FunctionDecl *GlobArrayDelete) {}
+
   /// An implicit member got a definition.
   virtual void CompletedImplicitDefinition(const FunctionDecl *D) {}
 
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index dfa3befb27dd0..5c4ad3c45da19 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -2872,8 +2872,6 @@ class CXXDestructorDecl : public CXXMethodDecl {
 
   // FIXME: Don't allocate storage for these except in the first declaration
   // of a virtual destructor.
-  FunctionDecl *OperatorDelete = nullptr;
-  FunctionDecl *OperatorGlobalDelete = nullptr;
   Expr *OperatorDeleteThisArg = nullptr;
 
   CXXDestructorDecl(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc,
@@ -2900,14 +2898,12 @@ class CXXDestructorDecl : public CXXMethodDecl {
 
   void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg);
   void setOperatorGlobalDelete(FunctionDecl *OD);
-
-  const FunctionDecl *getOperatorDelete() const {
-    return getCanonicalDecl()->OperatorDelete;
-  }
-
-  const FunctionDecl *getOperatorGlobalDelete() const {
-    return getCanonicalDecl()->OperatorGlobalDelete;
-  }
+  void setOperatorArrayDelete(FunctionDecl *OD);
+  void setGlobalOperatorArrayDelete(FunctionDecl *OD);
+  const FunctionDecl *getOperatorDelete() const;
+  const FunctionDecl *getOperatorGlobalDelete() const;
+  const FunctionDecl *getArrayOperatorDelete() const;
+  const FunctionDecl *getGlobalArrayOperatorDelete() const;
 
   Expr *getOperatorDeleteThisArg() const {
     return getCanonicalDecl()->OperatorDeleteThisArg;
diff --git a/clang/include/clang/AST/VTableBuilder.h b/clang/include/clang/AST/VTableBuilder.h
index a5de41dbc22f1..e1efe8cddcc5e 100644
--- a/clang/include/clang/AST/VTableBuilder.h
+++ b/clang/include/clang/AST/VTableBuilder.h
@@ -150,7 +150,7 @@ class VTableComponent {
 
   bool isRTTIKind() const { return isRTTIKind(getKind()); }
 
-  GlobalDecl getGlobalDecl() const {
+  GlobalDecl getGlobalDecl(bool HasVectorDeletingDtors) const {
     assert(isUsedFunctionPointerKind() &&
            "GlobalDecl can be created only from virtual function");
 
@@ -161,7 +161,9 @@ class VTableComponent {
     case CK_CompleteDtorPointer:
       return GlobalDecl(DtorDecl, CXXDtorType::Dtor_Complete);
     case CK_DeletingDtorPointer:
-      return GlobalDecl(DtorDecl, CXXDtorType::Dtor_Deleting);
+      return GlobalDecl(DtorDecl, (HasVectorDeletingDtors)
+                                      ? CXXDtorType::Dtor_VectorDeleting
+                                      : CXXDtorType::Dtor_Deleting);
     case CK_VCallOffset:
     case CK_VBaseOffset:
     case CK_OffsetToTop:
diff --git a/clang/include/clang/Basic/ABI.h b/clang/include/clang/Basic/ABI.h
index 8279529c316cf..be3edccbf50b2 100644
--- a/clang/include/clang/Basic/ABI.h
+++ b/clang/include/clang/Basic/ABI.h
@@ -32,11 +32,12 @@ enum CXXCtorType {
 
 /// C++ destructor types.
 enum CXXDtorType {
-  Dtor_Deleting, ///< Deleting dtor
-  Dtor_Complete, ///< Complete object dtor
-  Dtor_Base,     ///< Base object dtor
-  Dtor_Comdat,   ///< The COMDAT used for dtors
-  Dtor_Unified,  ///< GCC-style unified dtor
+  Dtor_Deleting,       ///< Deleting dtor
+  Dtor_Complete,       ///< Complete object dtor
+  Dtor_Base,           ///< Base object dtor
+  Dtor_Comdat,         ///< The COMDAT used for dtors
+  Dtor_Unified,        ///< GCC-style unified dtor
+  Dtor_VectorDeleting, ///< Vector deleting dtor
 };
 
 } // end namespace clang
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 39af84c8d0872..1c16f9f79ae68 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1796,6 +1796,11 @@ class TargetInfo : public TransferrableTargetInfo,
   /// destructor body.
   virtual bool callGlobalDeleteInDeletingDtor(const LangOptions &) const;
 
+  /// Controls whether to emit MSVC vector deleting destructors. The support for
+  /// vector deleting affects vtable layout and therefore is an ABI breaking
+  /// change. Support was first implemented in the Clang 22 timeframe.
+  virtual bool emitVectorDeletingDtors(const LangOptions &) const;
+
   /// Controls if __builtin_longjmp / __builtin_setjmp can be lowered to
   /// llvm.eh.sjlj.longjmp / llvm.eh.sjlj.setjmp.
   virtual bool hasSjLjLowering() const {
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 163ab32fafa48..6ca182338d6af 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -8570,7 +8570,8 @@ class Sema final : public SemaBase {
   FunctionDecl *FindDeallocationFunctionForDestructor(SourceLocation StartLoc,
                                                       CXXRecordDecl *RD,
                                                       bool Diagnose,
-                                                      bool LookForGlobal);
+                                                      bool LookForGlobal,
+                                                      DeclarationName Name);
 
   /// ActOnCXXDelete - Parsed a C++ 'delete' expression (C++ 5.3.5), as in:
   /// @code ::delete ptr; @endcode
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index c77c98dffc39f..dbbfc29058f43 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -955,6 +955,10 @@ class ASTWriter : public ASTDeserializationListener,
                               Expr *ThisArg) override;
   void ResolvedOperatorGlobDelete(const CXXDestructorDecl *DD,
                                   const FunctionDecl *Delete) override;
+  void ResolvedOperatorArrayDelete(const CXXDestructorDecl *DD,
+                                   const FunctionDecl *Delete) override;
+  void ResolvedOperatorGlobArrayDelete(const CXXDestructorDecl *DD,
+                                       const FunctionDecl *Delete) override;
   void CompletedImplicitDefinition(const FunctionDecl *D) override;
   void InstantiationRequested(const ValueDecl *D) override;
   void VariableDefinitionInstantiated(const VarDecl *D) override;
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index fab907b9c1a40..72e5c3738919b 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -13333,6 +13333,71 @@ bool ASTContext::isTypeAwareOperatorNewOrDelete(const FunctionDecl *FD) const {
   return TypeAwareOperatorNewAndDeletes.contains(FD->getCanonicalDecl());
 }
 
+// Records OperatorDelete as the operator delete of kind K for Dtor. Entries
+// are keyed by the canonical declaration; an existing entry is overwritten.
+void ASTContext::addOperatorDeleteForVDtor(const CXXDestructorDecl *Dtor,
+                                           FunctionDecl *OperatorDelete,
+                                           OperatorDeleteKind K) const {
+  const CXXDestructorDecl *Canon = Dtor->getCanonicalDecl();
+  switch (K) {
+  case OperatorDeleteKind::Regular:
+    OperatorDeletesForVirtualDtor[Canon] = OperatorDelete;
+    break;
+  case OperatorDeleteKind::GlobalRegular:
+    GlobalOperatorDeletesForVirtualDtor[Canon] = OperatorDelete;
+    break;
+  case OperatorDeleteKind::Array:
+    ArrayOperatorDeletesForVirtualDtor[Canon] = OperatorDelete;
+    break;
+  case OperatorDeleteKind::ArrayGlobal:
+    GlobalArrayOperatorDeletesForVirtualDtor[Canon] = OperatorDelete;
+    break;
+  }
+}
+
+// Returns true if an operator delete of kind K has been recorded for the
+// canonical declaration of Dtor.
+bool ASTContext::dtorHasOperatorDelete(const CXXDestructorDecl *Dtor,
+                                       OperatorDeleteKind K) const {
+  const CXXDestructorDecl *Canon = Dtor->getCanonicalDecl();
+  switch (K) {
+  case OperatorDeleteKind::Regular:
+    return OperatorDeletesForVirtualDtor.contains(Canon);
+  case OperatorDeleteKind::GlobalRegular:
+    return GlobalOperatorDeletesForVirtualDtor.contains(Canon);
+  case OperatorDeleteKind::Array:
+    return ArrayOperatorDeletesForVirtualDtor.contains(Canon);
+  case OperatorDeleteKind::ArrayGlobal:
+    return GlobalArrayOperatorDeletesForVirtualDtor.contains(Canon);
+  }
+  return false;
+}
+
+// Returns the operator delete of kind K recorded for the canonical declaration
+// of Dtor, or nullptr if none has been recorded.
+//
+// DenseMap::lookup() is used rather than contains() followed by operator[]:
+// it hashes the key once instead of twice, returns a value-initialized
+// (null) pointer when the key is absent, and never inserts into the map, so
+// a pure query cannot grow these mutable maps.
+FunctionDecl *
+ASTContext::getOperatorDeleteForVDtor(const CXXDestructorDecl *Dtor,
+                                      OperatorDeleteKind K) const {
+  const CXXDestructorDecl *Canon = Dtor->getCanonicalDecl();
+  switch (K) {
+  case OperatorDeleteKind::Regular:
+    return OperatorDeletesForVirtualDtor.lookup(Canon);
+  case OperatorDeleteKind::GlobalRegular:
+    return GlobalOperatorDeletesForVirtualDtor.lookup(Canon);
+  case OperatorDeleteKind::Array:
+    return ArrayOperatorDeletesForVirtualDtor.lookup(Canon);
+  case OperatorDeleteKind::ArrayGlobal:
+    return GlobalArrayOperatorDeletesForVirtualDtor.lookup(Canon);
+  }
+  // Only reached if K holds a value outside the enumerators handled above.
+  return nullptr;
+}
+
 MangleNumberingContext &
 ASTContext::getManglingNumberContext(const DeclContext *DC) {
   assert(LangOpts.CPlusPlus);  // We don't need mangling numbers for plain C.
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 24e4f189cbe4a..c16b1bb7a3453 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -3110,12 +3110,15 @@ CXXDestructorDecl *CXXDestructorDecl::Create(
 }
 
 void CXXDestructorDecl::setOperatorDelete(FunctionDecl *OD, Expr *ThisArg) {
-  auto *First = cast<CXXDestructorDecl>(getFirstDecl());
-  if (OD && !First->OperatorDelete) {
-    First->OperatorDelete = OD;
-    First->OperatorDeleteThisArg = ThisArg;
+  assert(!OD || (OD->getDeclName().getCXXOverloadedOperator() == OO_Delete));
+  if (OD && !getASTContext().dtorHasOperatorDelete(
+                this, ASTContext::OperatorDeleteKind::Regular)) {
+    getASTContext().addOperatorDeleteForVDtor(
+        this, OD, ASTContext::OperatorDeleteKind::Regular);
+    getCanonicalDecl()->OperatorDeleteThisArg = ThisArg;
     if (auto *L = getASTMutationListener())
-      L->ResolvedOperatorDelete(First, OD, ThisArg);
+      L->ResolvedOperatorDelete(cast<CXXDestructorDecl>(getCanonicalDecl()), OD,
+                                ThisArg);
   }
 }
 
@@ -3127,14 +3130,63 @@ void CXXDestructorDecl::setOperatorGlobalDelete(FunctionDecl *OD) {
   assert(!OD ||
          (OD->getDeclName().getCXXOverloadedOperator() == OO_Delete &&
           OD->getDeclContext()->getRedeclContext()->isTranslationUnit()));
-  auto *Canonical = cast<CXXDestructorDecl>(getCanonicalDecl());
-  if (!Canonical->OperatorGlobalDelete) {
-    Canonical->OperatorGlobalDelete = OD;
+  if (OD && !getASTContext().dtorHasOperatorDelete(
+                this, ASTContext::OperatorDeleteKind::GlobalRegular)) {
+    getASTContext().addOperatorDeleteForVDtor(
+        this, OD, ASTContext::OperatorDeleteKind::GlobalRegular);
     if (auto *L = getASTMutationListener())
-      L->ResolvedOperatorGlobDelete(Canonical, OD);
+      L->ResolvedOperatorGlobDelete(cast<CXXDestructorDecl>(getCanonicalDecl()),
+                                    OD);
   }
 }
 
+void CXXDestructorDecl::setOperatorArrayDelete(FunctionDecl *OD) {
+  assert(!OD ||
+         (OD->getDeclName().getCXXOverloadedOperator() == OO_Array_Delete));
+  if (OD && !getASTContext().dtorHasOperatorDelete(
+                this, ASTContext::OperatorDeleteKind::Array)) {
+    getASTContext().addOperatorDeleteForVDtor(
+        this, OD, ASTContext::OperatorDeleteKind::Array);
+    if (auto *L = getASTMutationListener())
+      L->ResolvedOperatorArrayDelete(
+          cast<CXXDestructorDecl>(getCanonicalDecl()), OD);
+  }
+}
+
+void CXXDestructorDecl::setGlobalOperatorArrayDelete(FunctionDecl *OD) {
+  assert(!OD ||
+         (OD->getDeclName().getCXXOverloadedOperator() == OO_Array_Delete &&
+          OD->getDeclContext()->getRedeclContext()->isTranslationUnit()));
+  if (OD && !getASTContext().dtorHasOperatorDelete(
+                this, ASTContext::OperatorDeleteKind::ArrayGlobal)) {
+    getASTContext().addOperatorDeleteForVDtor(
+        this, OD, ASTContext::OperatorDeleteKind::ArrayGlobal);
+    if (auto *L = getASTMutationListener())
+      L->ResolvedOperatorGlobArrayDelete(
+          cast<CXXDestructorDecl>(getCanonicalDecl()), OD);
+  }
+}
+
+const FunctionDecl *CXXDestructorDecl::getOperatorDelete() const {
+  return getASTContext().getOperatorDeleteForVDtor(
+      this, ASTContext::OperatorDeleteKind::Regular);
+}
+
+const FunctionDecl *CXXDestructorDecl::getOperatorGlobalDelete() const {
+  return getASTContext().getOperatorDeleteForVDtor(
+      this, ASTContext::OperatorDeleteKind::GlobalRegular);
+}
+
+const FunctionDecl *CXXDestructorDecl::getArrayOperatorDelete() const {
+  return getASTContext().getOperatorDeleteForVDtor(
+      this, ASTContext::OperatorDeleteKind::Array);
+}
+
+const FunctionDecl *CXXDestructorDecl::getGlobalArrayOperatorDelete() const {
+  return getASTContext().getOperatorDeleteForVDtor(
+      this, ASTContext::OperatorDeleteKind::ArrayGlobal);
+}
+
 bool CXXDestructorDecl::isCalledByDelete(const FunctionDecl *OpDel) const {
   // C++20 [expr.delete]p6: If the value of the operand of the delete-
   // expression is not a null pointer value and the selected deallocation
@@ -3146,7 +3198,8 @@ bool CXXDestructorDecl::isCalledByDelete(const FunctionDecl *OpDel) const {
   // delete operator, as that destructor is never called, unless the
   // destructor is virtual (see [expr.delete]p8.1) because then the
   // selected operator depends on the dynamic type of the pointer.
-  const FunctionDecl *SelectedOperatorDelete = OpDel ? OpDel : OperatorDelete;
+  const FunctionDecl *SelectedOperatorDelete =
+      OpDel ? OpDel : getOperatorDelete();
   if (!SelectedOperatorDelete)
     return true;
 
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 340bb4b2ed6a3..1d914fa876759 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -71,6 +71,9 @@ const CXXRecordDecl *Expr::getBestDynamicClassType() const {
   if (const PointerType *PTy = DerivedType->getAs<PointerType>())
     DerivedType = PTy->getPointeeType();
 
+  while (const ArrayType *ATy = DerivedType->getAsArrayTypeUnsafe())
+    DerivedType = ATy->getElementType();
+
   if (DerivedType->isDependentType())
     return nullptr;
 
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 5572e0a7ae59c..a5bcf5c97e837 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -6040,6 +6040,8 @@ void CXXNameMangler::mangleCXXDtorType(CXXDtorType T) {
   case Dtor_Comdat:
     Out << "D5";
     break;
+  case Dtor_VectorDeleting:
+    llvm_unreachable("Itanium ABI does not use vector deleting dtors");
   }
 }
 
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index f1baf9f49384b..551aa7bf3321c 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -1492,8 +1492,9 @@ void MicrosoftCXXNameMangler::mangleCXXDtorType(CXXDtorType T) {
   // <operator-name> ::= ?_G # scalar deleting destructor
   case Dtor_Deleting: Out << "?_G"; return;
   // <operator-name> ::= ?_E # vector deleting destructor
-  // FIXME: Add a vector deleting dtor type.  It goes in the vtable, so we need
-  // it.
+  case Dtor_VectorDeleting:
+    Out << "?_E";
+    return;
   case Dtor_Comdat:
     llvm_unreachable("not expecting a COMDAT");
   case Dtor_Unified:
@@ -2913,9 +2914,12 @@ void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T,
   //               ::= @ # structors (they have no declared return type)
   if (IsStructor) {
     if (isa<CXXDestructorDecl>(D) && isStructorDecl(D)) {
-      // The scalar deleting destructor takes an extra int argument which is not
-      // reflected in the AST.
-      if (StructorType == Dtor_Deleting) {
+      // The deleting destructors take an extra argument of type int that
+      // indicates whether the storage for the object should be deleted and
+      // whether a single object or an array of objects is being destroyed. This
+      // extra argument is not reflected in the AST.
+      if (StructorType == Dtor_Deleting ||
+          StructorType == Dtor_VectorDeleting) {
         Out << (PointersAre64Bit ? "PEAXI@Z" : "PAXI@Z");
         return;
       }
@@ -3911,10 +3915,10 @@ void MicrosoftMangleContextImpl::mangleCXXDtorThunk(const CXXDestructorDecl *DD,
                                                     const ThunkInfo &Thunk,
                                                     bool /*ElideOverrideInfo*/,
                                                     raw_ostream &Out) {
-  // FIXME: Actually, the dtor thunk should be emitted for vector deleting
-  // dtors rather than scalar deleting dtors. Just use the vector deleting dtor
-  // mangling manually until we support both deleting dtor types.
-  assert(Type == Dtor_Deleting);
+  // The dtor thunk should use vector deleting dtor mangling, however as an
+  // optimization we may end up emitting only scalar deleting dtor body, so just
+  // use the vector deleting dtor mangling manually.
+  assert(Type == Dtor_Deleting || Type == Dtor_VectorDeleting);
   msvc_hashing_ostream MHO(Out);
   MicrosoftCXXNameMangler Mangler(*this, MHO, DD, Type);
   Mangler.getStream() << "??_E";
diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp
index 3ded3a51206da..9951126c2c3a3 100644
--- a/clang/lib/AST/VTableBuilder.cpp
+++ b/clang/lib/AST/VTableBuilder.cpp
@@ -2658,7 +2658,12 @@ class VFTableBuilder {
       MethodVFTableLocation Loc(MI.VBTableIndex, WhichVFPtr.getVBaseWithVPtr(),
                                 WhichVFPtr.NonVirtualOffset, MI.VFTableIndex);
       if (const CXXDestructorDecl *DD = dyn_cast<CXXDestructorDecl>(MD)) {
-        MethodVFTableLocations[GlobalDecl(DD, Dtor_Deleting)] = Loc;
+        // The MS ABI vftable references the vector deleting dtor when enabled.
+        CXXDtorType DtorTy = Context.getTargetInfo().emitVectorDeletingDtors(
+                                 Context.getLangOpts())
+                                 ? Dtor_VectorDeleting
+                                 : Dtor_Deleting;
+        MethodVFTableLocations[GlobalDecl(DD, DtorTy)] = Loc;
       } else {
         MethodVFTableLocations[MD] = Loc;
       }
@@ -3288,7 +3293,11 @@ void VFTableBuilder::dumpLayout(raw_ostream &Out) {
       const CXXDestructorDecl *DD = Component.getDestructorDecl();
 
       DD->printQualifiedName(Out);
-      Out << "() [scalar deleting]";
+      if (Context.getTargetInfo().emitVectorDeletingDtors(
+              Context.getLangOpts()))
+        Out << "() [vector deleting]";
+      else
+        Out << "() [scalar deleting]";
 
       if (DD->isPureVirtual())
         Out << " [pure]";
@@ -3758,7 +3767,7 @@ void MicrosoftVTableContext::dumpMethodLocations(
         PredefinedIdentKind::PrettyFunctionNoVirtual, MD);
 
     if (isa<CXXDestructorDecl>(MD)) {
-      IndicesMap[I.second] = MethodName + " [scalar deleting]";
+      IndicesMap[I.second] = MethodName + " [vector deleting]";
     } else {
       IndicesMap[I.second] = MethodName;
     }
@@ -3874,7 +3883,8 @@ MicrosoftVTableContext::getMethodVFTableLocation(GlobalDecl GD) {
   assert(hasVtableSlot(cast<CXXMethodDecl>(GD.getDecl())) &&
          "Only use this method for virtual methods or dtors");
   if (isa<CXXDestructorDecl>(GD.getDecl()))
-    assert(GD.getDtorType() == Dtor_Deleting);
+    assert(GD.getDtorType() == Dtor_VectorDeleting ||
+           GD.getDtorType() == Dtor_Deleting);
 
   GD = GD.getCanonicalDecl();
 
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 9a5db6e164f66..ffaf98bf9c366 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -636,6 +636,13 @@ bool TargetInfo::callGlobalDeleteInDeletingDtor(
   return false;
 }
 
+/// Vector deleting destructors are emitted only for the Microsoft C++ ABI and
+/// only when the Clang ABI compatibility level is newer than Ver21.
+bool TargetInfo::emitVectorDeletingDtors(const LangOptions &LangOpts) const {
+  return getCXXABI() == TargetCXXABI::Microsoft &&
+         LangOpts.getClangABICompat() > LangOptions::ClangABI::Ver21;
+}
+
 bool TargetInfo::areDefaultedSMFStillPOD(const LangOptions &LangOpts) const {
   return LangOpts.getClangABICompat() > LangOptions::ClangABI::Ver15;
 }
diff --git a/clang/lib/CodeGen/CGCXX.cpp b/clang/lib/CodeGen/CGCXX.cpp
index 59aeff6804b61..8ca53c1b58a9d 100644
--- a/clang/lib/CodeGen/CGCXX.cpp
+++ b/clang/lib/CodeGen/CGCXX.cpp
@@ -174,7 +174,6 @@ bool CodeGenModule::TryEmitBaseDestructorAsAlias(const CXXDestructorDecl *D) {
   // requires explicit comdat support in the IL.
   if (llvm::GlobalValue::isWeakForLinker(TargetLinkage))
     return true;
-
   // Create the alias with no name.
   auto *Alias = llvm::GlobalAlias::create(AliasValueType, 0, Linkage, "",
                                           Aliasee, &getModule());
@@ -200,6 +199,42 @@ bool CodeGenModule::TryEmitBaseDestructorAsAlias(const CXXDestructorDecl *D) {
   return false;
 }
 
+/// Unconditionally emit \p AliasDecl as a global alias of \p TargetDecl. Does
+/// nothing when a definition with the alias' mangled name already exists.
+void CodeGenModule::EmitDefinitionAsAlias(GlobalDecl AliasDecl,
+                                          GlobalDecl TargetDecl) {
+  llvm::Type *AliasTy = getTypes().GetFunctionType(AliasDecl);
+
+  StringRef AliasName = getMangledName(AliasDecl);
+  llvm::GlobalValue *Existing = GetGlobalValue(AliasName);
+  // An already emitted definition wins; there is nothing left to alias.
+  if (Existing && !Existing->isDeclaration())
+    return;
+
+  auto *Aliasee = cast<llvm::GlobalValue>(GetAddrOfGlobal(TargetDecl));
+  llvm::GlobalValue::LinkageTypes Linkage = getFunctionLinkage(AliasDecl);
+
+  // Create the alias unnamed; its final name is assigned below.
+  auto *Alias = llvm::GlobalAlias::create(AliasTy, 0, Linkage, "", Aliasee,
+                                          &getModule());
+  // Destructors are always unnamed_addr.
+  Alias->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+
+  if (!Existing) {
+    Alias->setName(AliasName);
+  } else {
+    assert(Existing->getValueType() == AliasTy &&
+           Existing->getAddressSpace() == Alias->getAddressSpace() &&
+           "declaration exists with different type");
+    Alias->takeName(Existing);
+    Existing->replaceAllUsesWith(Alias);
+    Existing->eraseFromParent();
+  }
+
+  // Set any additional necessary attributes for the alias.
+  SetCommonAttributes(AliasDecl, Alias);
+}
+
 llvm::Function *CodeGenModule::codegenCXXStructor(GlobalDecl GD) {
   const CGFunctionInfo &FnInfo = getTypes().arrangeCXXStructorDeclaration(GD);
   auto *Fn = cast<llvm::Function>(
diff --git a/clang/lib/CodeGen/CGCXXABI.cpp b/clang/lib/CodeGen/CGCXXABI.cpp
index 30e5dc2b6cbd9..4051cacbbbc1d 100644
--- a/clang/lib/CodeGen/CGCXXABI.cpp
+++ b/clang/lib/CodeGen/CGCXXABI.cpp
@@ -268,6 +268,20 @@ void CGCXXABI::ReadArrayCookie(CodeGenFunction &CGF, Address ptr,
   numElements = readArrayCookieImpl(CGF, allocAddr, cookieSize);
 }
 
+void CGCXXABI::ReadArrayCookie(CodeGenFunction &CGF, Address ptr,
+                               QualType eltTy, llvm::Value *&numElements,
+                               llvm::Value *&allocPtr, CharUnits &cookieSize) {
+  assert(eltTy.isDestructedType());
+
+  cookieSize = getArrayCookieSizeImpl(eltTy);
+  // View the pointer as a char* in the same address space and step back over
+  // the cookie to reach the address the cookie is read from.
+  Address allocAddr = CGF.Builder.CreateConstInBoundsByteGEP(
+      ptr.withElementType(CGF.Int8Ty), -cookieSize);
+  allocPtr = allocAddr.emitRawPointer(CGF);
+  numElements = readArrayCookieImpl(CGF, allocAddr, cookieSize);
+}
+
 llvm::Value *CGCXXABI::readArrayCookieImpl(CodeGenFunction &CGF,
                                            Address ptr,
                                            CharUnits cookieSize) {
diff --git a/clang/lib/CodeGen/CGCXXABI.h b/clang/lib/CodeGen/CGCXXABI.h
index 2dd320dbda976..47090276c56b0 100644
--- a/clang/lib/CodeGen/CGCXXABI.h
+++ b/clang/lib/CodeGen/CGCXXABI.h
@@ -583,6 +583,12 @@ class CGCXXABI {
                                QualType ElementType, llvm::Value *&NumElements,
                                llvm::Value *&AllocPtr, CharUnits &CookieSize);
 
+  /// Reads the array cookie associated with the given pointer, which is
+  /// expected to have one.
+  void ReadArrayCookie(CodeGenFunction &CGF, Address Ptr, QualType ElementType,
+                       llvm::Value *&NumElements, llvm::Value *&AllocPtr,
+                       CharUnits &CookieSize);
+
   /// Return whether the given global decl needs a VTT parameter.
   virtual bool NeedsVTTParameter(GlobalDecl GD);
 
diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp
index f782b0cd17da4..ced175a9a8f0e 100644
--- a/clang/lib/CodeGen/CGClass.cpp
+++ b/clang/lib/CodeGen/CGClass.cpp
@@ -1442,6 +1442,95 @@ static bool CanSkipVTablePointerInitialization(CodeGenFunction &CGF,
   return true;
 }
 
+/// Emits the array-delete path of a vector deleting destructor. If value 2 is
+/// set in \p ShouldDeleteCondition, the array elements are destroyed and, if
+/// requested, their storage is freed. Emission resumes in "dtor.scalar".
+static void EmitConditionalArrayDtorCall(const CXXDestructorDecl *DD,
+                                         CodeGenFunction &CGF,
+                                         llvm::Value *ShouldDeleteCondition) {
+  Address This = CGF.LoadCXXThisAddress();
+  llvm::BasicBlock *ScalarBB = CGF.createBasicBlock("dtor.scalar");
+  llvm::BasicBlock *DeleteBB =
+      CGF.createBasicBlock("dtor.call_delete_after_array_destroy");
+  llvm::BasicBlock *VectorBB = CGF.createBasicBlock("dtor.vector");
+  auto *FlagsTy = cast<llvm::IntegerType>(ShouldDeleteCondition->getType());
+
+  // Flag value 2 marks an array delete; when it is clear take the scalar path.
+  llvm::Value *ArrayFlag = CGF.Builder.CreateAnd(
+      ShouldDeleteCondition, llvm::ConstantInt::get(FlagsTy, 2));
+  llvm::Value *IsScalar = CGF.Builder.CreateIsNull(ArrayFlag);
+  CGF.Builder.CreateCondBr(IsScalar, ScalarBB, VectorBB);
+
+  CGF.EmitBlock(VectorBB);
+
+  // Read the element count and the allocation pointer from the array cookie.
+  llvm::Value *NumElts = nullptr;
+  llvm::Value *AllocPtr = nullptr;
+  CharUnits CookieSize;
+  QualType EltTy = DD->getThisType()->getPointeeType();
+  CGF.CGM.getCXXABI().ReadArrayCookie(CGF, This, EltTy, NumElts, AllocPtr,
+                                      CookieSize);
+
+  QualType::DestructionKind DtorKind = EltTy.isDestructedType();
+  assert(DtorKind);
+  assert(NumElts && "no element count for a type with a destructor!");
+
+  CharUnits EltSize = CGF.getContext().getTypeSizeInChars(EltTy);
+  CharUnits EltAlign = This.getAlignment().alignmentOfArrayElement(EltSize);
+
+  llvm::Value *ArrayBegin = This.emitRawPointer(CGF);
+  llvm::Value *ArrayEnd = CGF.Builder.CreateInBoundsGEP(
+      This.getElementType(), ArrayBegin, NumElts, "delete.end");
+
+  // Destroy the elements. A zero-length array was already ruled out before
+  // entering the vector deleting dtor, so no emptiness check is emitted.
+  CGF.emitArrayDestroy(ArrayBegin, ArrayEnd, EltTy, EltAlign,
+                       CGF.getDestroyer(DtorKind),
+                       /*checkZeroLength*/ false, CGF.needsEHCleanup(DtorKind));
+
+  llvm::BasicBlock *VectorContBB = CGF.createBasicBlock("dtor.vector.cont");
+  CGF.EmitBlock(VectorContBB);
+
+  // Flag value 1 requests deallocation; when it is clear go straight to return.
+  llvm::Value *DeleteFlag = CGF.Builder.CreateAnd(
+      ShouldDeleteCondition, llvm::ConstantInt::get(FlagsTy, 1));
+  llvm::Value *SkipDelete = CGF.Builder.CreateIsNull(DeleteFlag);
+  CGF.Builder.CreateCondBr(SkipDelete, CGF.ReturnBlock.getBlock(), DeleteBB);
+  CGF.EmitBlock(DeleteBB);
+
+  const CXXDestructorDecl *CurDtor = cast<CXXDestructorDecl>(CGF.CurCodeDecl);
+  const CXXRecordDecl *Parent = CurDtor->getParent();
+  assert(CurDtor->getArrayOperatorDelete());
+  if (!CurDtor->getGlobalArrayOperatorDelete()) {
+    CGF.EmitDeleteCall(CurDtor->getArrayOperatorDelete(), AllocPtr,
+                       CGF.getContext().getCanonicalTagType(Parent));
+  } else {
+    // If a global operator delete[] is recorded, the class had its own
+    // operator delete[]. In that case flag value 4 (the third bit) decides:
+    // when it is set, ::operator delete[] has to be called instead.
+    llvm::Value *GlobalFlag = CGF.Builder.CreateAnd(
+        ShouldDeleteCondition, llvm::ConstantInt::get(FlagsTy, 4));
+    llvm::Value *UseClassDelete = CGF.Builder.CreateIsNull(GlobalFlag);
+    llvm::BasicBlock *GlobalDeleteBB =
+        CGF.createBasicBlock("dtor.call_glob_delete_after_array_destroy");
+    llvm::BasicBlock *ClassDeleteBB =
+        CGF.createBasicBlock("dtor.call_class_delete_after_array_destroy");
+    CGF.Builder.CreateCondBr(UseClassDelete, ClassDeleteBB, GlobalDeleteBB);
+
+    CGF.EmitBlock(ClassDeleteBB);
+    CGF.EmitDeleteCall(CurDtor->getArrayOperatorDelete(), AllocPtr,
+                       CGF.getContext().getCanonicalTagType(Parent));
+    CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
+
+    CGF.EmitBlock(GlobalDeleteBB);
+    CGF.EmitDeleteCall(CurDtor->getGlobalArrayOperatorDelete(), AllocPtr,
+                       CGF.getContext().getCanonicalTagType(Parent));
+  }
+
+  CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
+  CGF.EmitBlock(ScalarBB);
+}
+
 /// EmitDestructorBody - Emits the body of the current destructor.
 void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) {
   const CXXDestructorDecl *Dtor = cast<CXXDestructorDecl>(CurGD.getDecl());
@@ -1471,7 +1560,9 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) {
   // outside of the function-try-block, which means it's always
   // possible to delegate the destructor body to the complete
   // destructor.  Do so.
-  if (DtorType == Dtor_Deleting) {
+  if (DtorType == Dtor_Deleting || DtorType == Dtor_VectorDeleting) {
+    if (CXXStructorImplicitParamValue && DtorType == Dtor_VectorDeleting)
+      EmitConditionalArrayDtorCall(Dtor, *this, CXXStructorImplicitParamValue);
     RunCleanupsScope DtorEpilogue(*this);
     EnterDtorCleanups(Dtor, Dtor_Deleting);
     if (HaveInsertPoint()) {
@@ -1502,6 +1593,8 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) {
     llvm_unreachable("not expecting a unified dtor");
   case Dtor_Comdat: llvm_unreachable("not expecting a COMDAT");
   case Dtor_Deleting: llvm_unreachable("already handled deleting case");
+  case Dtor_VectorDeleting:
+    llvm_unreachable("already handled vector deleting case");
 
   case Dtor_Complete:
     assert((Body || getTarget().getCXXABI().isMicrosoft()) &&
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index bda7b7487f59b..8299d45ece76f 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -2363,7 +2363,13 @@ llvm::DISubprogram *CGDebugInfo::CreateCXXMemberFunction(
       // Emit MS ABI vftable information.  There is only one entry for the
       // deleting dtor.
       const auto *DD = dyn_cast<CXXDestructorDecl>(Method);
-      GlobalDecl GD = DD ? GlobalDecl(DD, Dtor_Deleting) : GlobalDecl(Method);
+      GlobalDecl GD =
+          DD ? GlobalDecl(
+                   DD, CGM.getContext().getTargetInfo().emitVectorDeletingDtors(
+                           CGM.getContext().getLangOpts())
+                           ? Dtor_VectorDeleting
+                           : Dtor_Deleting)
+             : GlobalDecl(Method);
       MethodVFTableLocation ML =
           CGM.getMicrosoftVTableContext().getMethodVFTableLocation(GD);
       VIndex = ML.Index;
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 14d8db32bafc6..f64cf9f8a6c2d 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1206,6 +1206,16 @@ void CodeGenFunction::EmitNewArrayInitializer(
     EmitCXXAggrConstructorCall(Ctor, NumElements, CurPtr, CCE,
                                /*NewPointerIsChecked*/true,
                                CCE->requiresZeroInitialization());
+
+    // To support MSVC vector deleting destructors, record that new[] was
+    // called for this class. To limit code size, vector deleting destructors
+    // are only emitted when they are required. Strictly speaking they are
+    // required for delete[] calls, but MSVC triggers their emission whenever
+    // new[] is called for an object of the class, and we do the same for
+    // compatibility.
+    if (CGM.getContext().getTargetInfo().emitVectorDeletingDtors(
+            CGM.getContext().getLangOpts()))
+      CGM.requireVectorDestructorDefinition(Ctor->getParent());
     return;
   }
 
@@ -1912,10 +1922,8 @@ static void EmitDestroyingObjectDelete(CodeGenFunction &CGF,
 /// Emit the code for deleting a single object.
 /// \return \c true if we started emitting UnconditionalDeleteBlock, \c false
 /// if not.
-static bool EmitObjectDelete(CodeGenFunction &CGF,
-                             const CXXDeleteExpr *DE,
-                             Address Ptr,
-                             QualType ElementType,
+static bool EmitObjectDelete(CodeGenFunction &CGF, const CXXDeleteExpr *DE,
+                             Address Ptr, QualType ElementType,
                              llvm::BasicBlock *UnconditionalDeleteBlock) {
   // C++11 [expr.delete]p3:
   //   If the static type of the object to be deleted is different from its
@@ -2109,6 +2117,42 @@ void CodeGenFunction::EmitCXXDeleteExpr(const CXXDeleteExpr *E) {
   DeleteTy = getContext().getBaseElementType(DeleteTy);
   Ptr = Ptr.withElementType(ConvertTypeForMem(DeleteTy));
 
+  if (E->isArrayForm() &&
+      CGM.getContext().getTargetInfo().emitVectorDeletingDtors(
+          CGM.getContext().getLangOpts())) {
+    if (auto *RD = DeleteTy->getAsCXXRecordDecl()) {
+      auto *Dtor = RD->getDestructor();
+      if (Dtor && Dtor->isVirtual()) {
+        llvm::Value *NumElements = nullptr;
+        llvm::Value *AllocatedPtr = nullptr;
+        CharUnits CookieSize;
+        llvm::BasicBlock *BodyBB = createBasicBlock("vdtor.call");
+        llvm::BasicBlock *DoneBB = createBasicBlock("vdtor.nocall");
+        // Check array cookie to see if the array has length 0. Don't call
+        // the destructor in that case.
+        CGM.getCXXABI().ReadArrayCookie(*this, Ptr, E, DeleteTy, NumElements,
+                                        AllocatedPtr, CookieSize);
+
+        auto *CondTy = cast<llvm::IntegerType>(NumElements->getType());
+        llvm::Value *IsEmpty = Builder.CreateICmpEQ(
+            NumElements, llvm::ConstantInt::get(CondTy, 0));
+        Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
+
+        // Delete cookie for empty array.
+        const FunctionDecl *OperatorDelete = E->getOperatorDelete();
+        EmitBlock(DoneBB);
+        EmitDeleteCall(OperatorDelete, AllocatedPtr, DeleteTy, NumElements,
+                       CookieSize);
+        EmitBranch(DeleteEnd);
+
+        EmitBlock(BodyBB);
+        if (!EmitObjectDelete(*this, E, Ptr, DeleteTy, DeleteEnd))
+          EmitBlock(DeleteEnd);
+        return;
+      }
+    }
+  }
+
   if (E->isArrayForm()) {
     EmitArrayDelete(*this, E, Ptr, DeleteTy);
     EmitBlock(DeleteEnd);
diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp
index e14e883a55ac5..00d9f93effb32 100644
--- a/clang/lib/CodeGen/CGVTables.cpp
+++ b/clang/lib/CodeGen/CGVTables.cpp
@@ -770,7 +770,9 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder,
   case VTableComponent::CK_FunctionPointer:
   case VTableComponent::CK_CompleteDtorPointer:
   case VTableComponent::CK_DeletingDtorPointer: {
-    GlobalDecl GD = component.getGlobalDecl();
+    GlobalDecl GD = component.getGlobalDecl(
+        CGM.getContext().getTargetInfo().emitVectorDeletingDtors(
+            CGM.getContext().getLangOpts()));
 
     const bool IsThunk =
         nextVTableThunkIndex < layout.vtable_thunks().size() &&
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 08c66bdbbb9f8..3eeb1718e455a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -8288,3 +8288,53 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
 
   NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx);
 }
+
+bool CodeGenModule::classNeedsVectorDestructor(const CXXRecordDecl *RD) {
+  if (!Context.getTargetInfo().emitVectorDeletingDtors(Context.getLangOpts()))
+    return false;
+
+  // Whether new[]/delete[] get used outside of the DLL is unknowable here, so
+  // a defined, virtual, dllexport'ed destructor always forces emission of the
+  // vector deleting destructor. This matches MSVC behavior.
+  const CXXDestructorDecl *Dtor = RD->getDestructor();
+  const bool IsExportedVirtualDtor = Dtor && Dtor->isVirtual() &&
+                                     Dtor->isDefined() &&
+                                     Dtor->hasAttr<DLLExportAttr>();
+  return IsExportedVirtualDtor || RequireVectorDeletingDtor.count(RD);
+}
+
+void CodeGenModule::requireVectorDestructorDefinition(const CXXRecordDecl *RD) {
+  if (!Context.getTargetInfo().emitVectorDeletingDtors(Context.getLangOpts()))
+    return;
+  RequireVectorDeletingDtor.insert(RD);
+
+  // Normally only the scalar deleting destructor is emitted (lazily), with the
+  // vector deleting destructor being an alias to it, which keeps code size
+  // down. If that pair was emitted before we learned that a real vector
+  // deleting destructor is needed, both the alias and the scalar definition
+  // have to be dropped and the vector deleting destructor body queued for
+  // emission instead.
+  CXXDestructorDecl *Dtor = RD->getDestructor();
+  llvm::GlobalValue *ScalarGV =
+      GetGlobalValue(getMangledName(GlobalDecl(Dtor, Dtor_Deleting)));
+  if (!ScalarGV || ScalarGV->isDeclaration())
+    return;
+
+  GlobalDecl VectorDtorGD(Dtor, Dtor_VectorDeleting);
+  StringRef VectorName = getMangledName(VectorDtorGD);
+  llvm::GlobalValue *AliasGV = GetGlobalValue(VectorName);
+  // It exists and it should be an alias.
+  assert(AliasGV && isa<llvm::GlobalAlias>(AliasGV));
+
+  auto *NewFn = llvm::Function::Create(
+      cast<llvm::FunctionType>(AliasGV->getValueType()),
+      llvm::Function::ExternalLinkage, VectorName, &getModule());
+  SetFunctionAttributes(VectorDtorGD, NewFn, /*IsIncompleteFunction*/ false,
+                        /*IsThunk*/ false);
+  NewFn->takeName(AliasGV);
+  AliasGV->replaceAllUsesWith(NewFn);
+  AliasGV->eraseFromParent();
+  ScalarGV->replaceAllUsesWith(NewFn);
+  ScalarGV->eraseFromParent();
+  addDeferredDeclToEmit(VectorDtorGD);
+}
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index a253bcda2d06c..2acfc83338a0c 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -529,6 +529,9 @@ class CodeGenModule : public CodeGenTypeCache {
   /// that we don't re-emit the initializer.
   llvm::DenseMap<const Decl*, unsigned> DelayedCXXInitPosition;
 
+  /// To remember which types did require a vector deleting dtor.
+  llvm::SmallPtrSet<const CXXRecordDecl *, 16> RequireVectorDeletingDtor;
+
   typedef std::pair<OrderGlobalInitsOrStermFinalizers, llvm::Function *>
       GlobalInitData;
 
@@ -1547,6 +1550,7 @@ class CodeGenModule : public CodeGenTypeCache {
   void EmitGlobal(GlobalDecl D);
 
   bool TryEmitBaseDestructorAsAlias(const CXXDestructorDecl *D);
+  void EmitDefinitionAsAlias(GlobalDecl Alias, GlobalDecl Target);
 
   llvm::GlobalValue *GetGlobalValue(StringRef Ref);
 
@@ -1824,6 +1828,8 @@ class CodeGenModule : public CodeGenTypeCache {
     // behavior. So projects like the Linux kernel can rely on it.
     return !getLangOpts().CPlusPlus;
   }
+  void requireVectorDestructorDefinition(const CXXRecordDecl *RD);
+  bool classNeedsVectorDestructor(const CXXRecordDecl *RD);
 
   // Helper to get the alignment for a variable.
   unsigned getVtableGlobalVarAlignment(const VarDecl *D = nullptr) {
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 65c47633bc5c4..82a0acb9cd51e 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -93,6 +93,8 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI {
         llvm_unreachable("emitting dtor comdat as function?");
       case Dtor_Unified:
         llvm_unreachable("emitting unified dtor as function?");
+      case Dtor_VectorDeleting:
+        llvm_unreachable("unexpected dtor kind for this ABI");
       }
       llvm_unreachable("bad dtor kind");
     }
@@ -458,7 +460,8 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI {
        if (!IsInlined)
          continue;
 
-       StringRef Name = CGM.getMangledName(VtableComponent.getGlobalDecl());
+       StringRef Name = CGM.getMangledName(
+           VtableComponent.getGlobalDecl(/*HasVectorDeletingDtors=*/false));
        auto *Entry = CGM.GetGlobalValue(Name);
        // This checks if virtual inline function has already been emitted.
        // Note that it is possible that this inline function would be emitted
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 71e24491f19a4..11ca94f03cb98 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -71,8 +71,8 @@ class MicrosoftCXXABI : public CGCXXABI {
       switch (GD.getDtorType()) {
       case Dtor_Complete:
       case Dtor_Deleting:
+      case Dtor_VectorDeleting:
         return true;
-
       case Dtor_Base:
         return false;
 
@@ -269,7 +269,11 @@ class MicrosoftCXXABI : public CGCXXABI {
 
         // There's only Dtor_Deleting in vftable but it shares the this
         // adjustment with the base one, so look up the deleting one instead.
-        LookupGD = GlobalDecl(DD, Dtor_Deleting);
+        LookupGD = GlobalDecl(
+            DD, CGM.getContext().getTargetInfo().emitVectorDeletingDtors(
+                    CGM.getContext().getLangOpts())
+                    ? Dtor_VectorDeleting
+                    : Dtor_Deleting);
       }
       MethodVFTableLocation ML =
           CGM.getMicrosoftVTableContext().getMethodVFTableLocation(LookupGD);
@@ -351,8 +355,9 @@ class MicrosoftCXXABI : public CGCXXABI {
 
   void adjustCallArgsForDestructorThunk(CodeGenFunction &CGF, GlobalDecl GD,
                                         CallArgList &CallArgs) override {
-    assert(GD.getDtorType() == Dtor_Deleting &&
-           "Only deleting destructor thunks are available in this ABI");
+    assert((GD.getDtorType() == Dtor_VectorDeleting ||
+            GD.getDtorType() == Dtor_Deleting) &&
+           "Only vector deleting destructor thunks are available in this ABI");
     CallArgs.add(RValue::get(getStructorImplicitParamValue(CGF)),
                  getContext().IntTy);
   }
@@ -1107,7 +1112,8 @@ bool MicrosoftCXXABI::HasThisReturn(GlobalDecl GD) const {
 
 static bool isDeletingDtor(GlobalDecl GD) {
   return isa<CXXDestructorDecl>(GD.getDecl()) &&
-         GD.getDtorType() == Dtor_Deleting;
+         (GD.getDtorType() == Dtor_Deleting ||
+          GD.getDtorType() == Dtor_VectorDeleting);
 }
 
 bool MicrosoftCXXABI::hasMostDerivedReturn(GlobalDecl GD) const {
@@ -1360,7 +1366,8 @@ MicrosoftCXXABI::buildStructorSignature(GlobalDecl GD,
   AddedStructorArgCounts Added;
   // TODO: 'for base' flag
   if (isa<CXXDestructorDecl>(GD.getDecl()) &&
-      GD.getDtorType() == Dtor_Deleting) {
+      (GD.getDtorType() == Dtor_Deleting ||
+       GD.getDtorType() == Dtor_VectorDeleting)) {
     // The scalar deleting destructor takes an implicit int parameter.
     ArgTys.push_back(getContext().IntTy);
     ++Added.Suffix;
@@ -1392,7 +1399,7 @@ void MicrosoftCXXABI::setCXXDestructorDLLStorage(llvm::GlobalValue *GV,
                                                  CXXDtorType DT) const {
   // Deleting destructor variants are never imported or exported. Give them the
   // default storage class.
-  if (DT == Dtor_Deleting) {
+  if (DT == Dtor_Deleting || DT == Dtor_VectorDeleting) {
     GV->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
   } else {
     const NamedDecl *ND = Dtor;
@@ -1428,6 +1435,12 @@ llvm::GlobalValue::LinkageTypes MicrosoftCXXABI::getCXXDestructorLinkage(
     return llvm::GlobalValue::LinkOnceODRLinkage;
   case Dtor_Unified:
     llvm_unreachable("MS C++ ABI does not support unified dtors");
+  case Dtor_VectorDeleting:
+    // Use the weak, non-ODR linkage for vector deleting destructors to block
+    // inlining. This enables an MS ABI code-size saving optimization that
+    // allows us to avoid emitting array deletion code when arrays of a given
+    // type are not allocated within the final linkage unit.
+    return llvm::GlobalValue::WeakAnyLinkage;
   case Dtor_Comdat:
     llvm_unreachable("MS C++ ABI does not support comdat dtors");
   }
@@ -1459,7 +1472,11 @@ MicrosoftCXXABI::getVirtualFunctionPrologueThisAdjustment(GlobalDecl GD) {
 
     // There's no Dtor_Base in vftable but it shares the this adjustment with
     // the deleting one, so look it up instead.
-    GD = GlobalDecl(DD, Dtor_Deleting);
+    GD =
+        GlobalDecl(DD, CGM.getContext().getTargetInfo().emitVectorDeletingDtors(
+                           CGM.getContext().getLangOpts())
+                           ? Dtor_VectorDeleting
+                           : Dtor_Deleting);
   }
 
   MethodVFTableLocation ML =
@@ -1508,7 +1525,11 @@ Address MicrosoftCXXABI::adjustThisArgumentForVirtualFunctionCall(
 
     // There's only Dtor_Deleting in vftable but it shares the this adjustment
     // with the base one, so look up the deleting one instead.
-    LookupGD = GlobalDecl(DD, Dtor_Deleting);
+    LookupGD =
+        GlobalDecl(DD, CGM.getContext().getTargetInfo().emitVectorDeletingDtors(
+                           CGM.getContext().getLangOpts())
+                           ? Dtor_VectorDeleting
+                           : Dtor_Deleting);
   }
   MethodVFTableLocation ML =
       CGM.getMicrosoftVTableContext().getMethodVFTableLocation(LookupGD);
@@ -2018,24 +2039,30 @@ llvm::Value *MicrosoftCXXABI::EmitVirtualDestructorCall(
   auto *CE = dyn_cast<const CXXMemberCallExpr *>(E);
   auto *D = dyn_cast<const CXXDeleteExpr *>(E);
   assert((CE != nullptr) ^ (D != nullptr));
-  assert(CE == nullptr || CE->arguments().empty());
-  assert(DtorType == Dtor_Deleting || DtorType == Dtor_Complete);
+  assert(CE == nullptr || CE->arg_begin() == CE->arg_end());
+  assert(DtorType == Dtor_VectorDeleting || DtorType == Dtor_Complete ||
+         DtorType == Dtor_Deleting);
 
   // We have only one destructor in the vftable but can get both behaviors
   // by passing an implicit int parameter.
-  GlobalDecl GD(Dtor, Dtor_Deleting);
+  ASTContext &Context = getContext();
+  bool VectorDeletingDtorsEnabled =
+      Context.getTargetInfo().emitVectorDeletingDtors(Context.getLangOpts());
+  GlobalDecl GD(Dtor, VectorDeletingDtorsEnabled ? Dtor_VectorDeleting
+                                                 : Dtor_Deleting);
   const CGFunctionInfo *FInfo =
       &CGM.getTypes().arrangeCXXStructorDeclaration(GD);
   llvm::FunctionType *Ty = CGF.CGM.getTypes().GetFunctionType(*FInfo);
   CGCallee Callee = CGCallee::forVirtual(CE, GD, This, Ty);
 
-  ASTContext &Context = getContext();
   bool IsDeleting = DtorType == Dtor_Deleting;
+  bool IsArrayDelete = D && D->isArrayForm() && VectorDeletingDtorsEnabled;
   bool IsGlobalDelete = D && D->isGlobalDelete() &&
                         Context.getTargetInfo().callGlobalDeleteInDeletingDtor(
                             Context.getLangOpts());
   llvm::Value *ImplicitParam =
-      CGF.Builder.getInt32((IsDeleting ? 1 : 0) | (IsGlobalDelete ? 4 : 0));
+      CGF.Builder.getInt32((IsDeleting ? 1 : 0) | (IsGlobalDelete ? 4 : 0) |
+                           (IsArrayDelete ? 2 : 0));
 
   QualType ThisTy;
   if (CE) {
@@ -2044,6 +2071,9 @@ llvm::Value *MicrosoftCXXABI::EmitVirtualDestructorCall(
     ThisTy = D->getDestroyedType();
   }
 
+  while (const ArrayType *ATy = Context.getAsArrayType(ThisTy))
+    ThisTy = ATy->getElementType();
+
   This = adjustThisArgumentForVirtualFunctionCall(CGF, GD, This, true);
   RValue RV =
       CGF.EmitCXXDestructorCall(GD, Callee, This.emitRawPointer(CGF), ThisTy,
@@ -4074,6 +4104,18 @@ void MicrosoftCXXABI::emitCXXStructor(GlobalDecl GD) {
   if (GD.getDtorType() == Dtor_Base && !CGM.TryEmitBaseDestructorAsAlias(dtor))
     return;
 
+  if (GD.getDtorType() == Dtor_VectorDeleting &&
+      !CGM.classNeedsVectorDestructor(dtor->getParent())) {
+    // Create GlobalDecl object with the correct type for the scalar
+    // deleting destructor.
+    GlobalDecl ScalarDtorGD(dtor, Dtor_Deleting);
+
+    // Emit an alias from the vector deleting destructor to the scalar deleting
+    // destructor.
+    CGM.EmitDefinitionAsAlias(GD, ScalarDtorGD);
+    return;
+  }
+
   llvm::Function *Fn = CGM.codegenCXXStructor(GD);
   if (Fn->isWeakForLinker())
     Fn->setComdat(CGM.getModule().getOrInsertComdat(Fn->getName()));
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 8030aac3d8771..aa36a79142e52 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -11139,9 +11139,11 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) {
     else
       Loc = RD->getLocation();
 
+    DeclarationName Name =
+        Context.DeclarationNames.getCXXOperatorName(OO_Delete);
     // If we have a virtual destructor, look up the deallocation function
     if (FunctionDecl *OperatorDelete = FindDeallocationFunctionForDestructor(
-            Loc, RD, /*Diagnose=*/true, /*LookForGlobal=*/false)) {
+            Loc, RD, /*Diagnose=*/true, /*LookForGlobal=*/false, Name)) {
       Expr *ThisArg = nullptr;
 
       // If the notional 'delete this' expression requires a non-trivial
@@ -11189,9 +11191,33 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) {
         // delete calls that require it.
         FunctionDecl *GlobalOperatorDelete =
             FindDeallocationFunctionForDestructor(Loc, RD, /*Diagnose*/ false,
-                                                  /*LookForGlobal*/ true);
+                                                  /*LookForGlobal*/ true, Name);
         Destructor->setOperatorGlobalDelete(GlobalOperatorDelete);
       }
+
+      if (Context.getTargetInfo().emitVectorDeletingDtors(
+              Context.getLangOpts())) {
+        // Look up delete[] too in case we have to emit a vector deleting dtor.
+        DeclarationName VDeleteName =
+            Context.DeclarationNames.getCXXOperatorName(OO_Array_Delete);
+        FunctionDecl *ArrOperatorDelete = FindDeallocationFunctionForDestructor(
+            Loc, RD, /*Diagnose*/ false,
+            /*LookForGlobal*/ false, VDeleteName);
+        if (ArrOperatorDelete && isa<CXXMethodDecl>(ArrOperatorDelete)) {
+          FunctionDecl *GlobalArrOperatorDelete =
+              FindDeallocationFunctionForDestructor(Loc, RD, /*Diagnose*/ false,
+                                                    /*LookForGlobal*/ true,
+                                                    VDeleteName);
+          Destructor->setGlobalOperatorArrayDelete(GlobalArrOperatorDelete);
+        } else if (!ArrOperatorDelete) {
+          ArrOperatorDelete = FindDeallocationFunctionForDestructor(
+              Loc, RD, /*Diagnose*/ false,
+              /*LookForGlobal*/ true, VDeleteName);
+        }
+        assert(ArrOperatorDelete &&
+               "Should've found at least global array delete");
+        Destructor->setOperatorArrayDelete(ArrOperatorDelete);
+      }
     }
   }
 
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index fe1f89b7a5dfa..dc7ed4e9a48bc 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -3612,11 +3612,9 @@ Sema::FindUsualDeallocationFunction(SourceLocation StartLoc,
   return Result.FD;
 }
 
-FunctionDecl *Sema::FindDeallocationFunctionForDestructor(SourceLocation Loc,
-                                                          CXXRecordDecl *RD,
-                                                          bool Diagnose,
-                                                          bool LookForGlobal) {
-  DeclarationName Name = Context.DeclarationNames.getCXXOperatorName(OO_Delete);
+FunctionDecl *Sema::FindDeallocationFunctionForDestructor(
+    SourceLocation Loc, CXXRecordDecl *RD, bool Diagnose, bool LookForGlobal,
+    DeclarationName Name) {
 
   FunctionDecl *OperatorDelete = nullptr;
   CanQualType DeallocType = Context.getCanonicalTagType(RD);
@@ -3649,8 +3647,11 @@ bool Sema::FindDeallocationFunction(SourceLocation StartLoc, CXXRecordDecl *RD,
   // Try to find operator delete/operator delete[] in class scope.
   LookupQualifiedName(Found, RD);
 
-  if (Found.isAmbiguous())
+  if (Found.isAmbiguous()) {
+    if (!Diagnose)
+      Found.suppressDiagnostics();
     return true;
+  }
 
   Found.suppressDiagnostics();
 
diff --git a/clang/lib/Serialization/ASTCommon.h b/clang/lib/Serialization/ASTCommon.h
index c9b9b1bbf8743..23d3954f257e7 100644
--- a/clang/lib/Serialization/ASTCommon.h
+++ b/clang/lib/Serialization/ASTCommon.h
@@ -42,7 +42,9 @@ enum class DeclUpdateKind {
   DeclMarkedOpenMPDeclareTarget,
   DeclExported,
   AddedAttrToRecord,
-  CXXResolvedDtorGlobDelete
+  CXXResolvedDtorGlobDelete,
+  CXXResolvedDtorArrayDelete,
+  CXXResolvedDtorGlobArrayDelete
 };
 
 TypeIdx TypeIdxFromBuiltin(const BuiltinType *BT);
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 5456e73956659..0ee8c3511527c 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2339,19 +2339,33 @@ void ASTDeclReader::VisitCXXConstructorDecl(CXXConstructorDecl *D) {
 void ASTDeclReader::VisitCXXDestructorDecl(CXXDestructorDecl *D) {
   VisitCXXMethodDecl(D);
 
-  CXXDestructorDecl *Canon = D->getCanonicalDecl();
+  ASTContext &C = Reader.getContext();
+  CXXDestructorDecl *Canon = cast<CXXDestructorDecl>(D->getCanonicalDecl());
   if (auto *OperatorDelete = readDeclAs<FunctionDecl>()) {
     auto *ThisArg = Record.readExpr();
     // FIXME: Check consistency if we have an old and new operator delete.
-    if (!Canon->OperatorDelete) {
-      Canon->OperatorDelete = OperatorDelete;
+    if (!C.dtorHasOperatorDelete(D, ASTContext::OperatorDeleteKind::Regular)) {
+      C.addOperatorDeleteForVDtor(D, OperatorDelete,
+                                  ASTContext::OperatorDeleteKind::Regular);
       Canon->OperatorDeleteThisArg = ThisArg;
     }
   }
   if (auto *OperatorGlobDelete = readDeclAs<FunctionDecl>()) {
-    if (!Canon->OperatorGlobalDelete) {
-      Canon->OperatorGlobalDelete = OperatorGlobDelete;
-    }
+    if (!C.dtorHasOperatorDelete(D,
+                                 ASTContext::OperatorDeleteKind::GlobalRegular))
+      C.addOperatorDeleteForVDtor(
+          D, OperatorGlobDelete, ASTContext::OperatorDeleteKind::GlobalRegular);
+  }
+  if (auto *OperatorArrayDelete = readDeclAs<FunctionDecl>()) {
+    if (!C.dtorHasOperatorDelete(D, ASTContext::OperatorDeleteKind::Array))
+      C.addOperatorDeleteForVDtor(D, OperatorArrayDelete,
+                                  ASTContext::OperatorDeleteKind::Array);
+  }
+  if (auto *OperatorGlobArrayDelete = readDeclAs<FunctionDecl>()) {
+    if (!C.dtorHasOperatorDelete(D,
+                                 ASTContext::OperatorDeleteKind::ArrayGlobal))
+      C.addOperatorDeleteForVDtor(D, OperatorGlobArrayDelete,
+                                  ASTContext::OperatorDeleteKind::ArrayGlobal);
   }
 }
 
@@ -4852,22 +4866,48 @@ void ASTDeclReader::UpdateDecl(Decl *D) {
     case DeclUpdateKind::CXXResolvedDtorDelete: {
       // Set the 'operator delete' directly to avoid emitting another update
       // record.
+      CXXDestructorDecl *Canon = cast<CXXDestructorDecl>(D->getCanonicalDecl());
+      ASTContext &C = Reader.getContext();
       auto *Del = readDeclAs<FunctionDecl>();
-      auto *First = cast<CXXDestructorDecl>(D->getCanonicalDecl());
       auto *ThisArg = Record.readExpr();
+      auto *Dtor = cast<CXXDestructorDecl>(D);
       // FIXME: Check consistency if we have an old and new operator delete.
-      if (!First->OperatorDelete) {
-        First->OperatorDelete = Del;
-        First->OperatorDeleteThisArg = ThisArg;
+      if (!C.dtorHasOperatorDelete(Dtor,
+                                   ASTContext::OperatorDeleteKind::Regular)) {
+        C.addOperatorDeleteForVDtor(Dtor, Del,
+                                    ASTContext::OperatorDeleteKind::Regular);
+        Canon->OperatorDeleteThisArg = ThisArg;
       }
       break;
     }
 
     case DeclUpdateKind::CXXResolvedDtorGlobDelete: {
       auto *Del = readDeclAs<FunctionDecl>();
-      auto *Canon = cast<CXXDestructorDecl>(D->getCanonicalDecl());
-      if (!Canon->OperatorGlobalDelete)
-        Canon->OperatorGlobalDelete = Del;
+      auto *Dtor = cast<CXXDestructorDecl>(D);
+      ASTContext &C = Reader.getContext();
+      if (!C.dtorHasOperatorDelete(
+              Dtor, ASTContext::OperatorDeleteKind::GlobalRegular))
+        C.addOperatorDeleteForVDtor(
+            Dtor, Del, ASTContext::OperatorDeleteKind::GlobalRegular);
+      break;
+    }
+    case DeclUpdateKind::CXXResolvedDtorArrayDelete: {
+      auto *Del = readDeclAs<FunctionDecl>();
+      auto *Dtor = cast<CXXDestructorDecl>(D);
+      ASTContext &C = Reader.getContext();
+      if (!C.dtorHasOperatorDelete(Dtor, ASTContext::OperatorDeleteKind::Array))
+        C.addOperatorDeleteForVDtor(Dtor, Del,
+                                    ASTContext::OperatorDeleteKind::Array);
+      break;
+    }
+    case DeclUpdateKind::CXXResolvedDtorGlobArrayDelete: {
+      auto *Del = readDeclAs<FunctionDecl>();
+      auto *Dtor = cast<CXXDestructorDecl>(D);
+      ASTContext &C = Reader.getContext();
+      if (!C.dtorHasOperatorDelete(Dtor,
+                                   ASTContext::OperatorDeleteKind::ArrayGlobal))
+        C.addOperatorDeleteForVDtor(
+            Dtor, Del, ASTContext::OperatorDeleteKind::ArrayGlobal);
       break;
     }
 
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index e8c0d3f2b4ee9..547497cbd87d9 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6531,6 +6531,14 @@ void ASTWriter::WriteDeclUpdatesBlocks(ASTContext &Context,
         Record.AddDeclRef(Update.getDecl());
         break;
 
+      case DeclUpdateKind::CXXResolvedDtorArrayDelete:
+        Record.AddDeclRef(Update.getDecl());
+        break;
+
+      case DeclUpdateKind::CXXResolvedDtorGlobArrayDelete:
+        Record.AddDeclRef(Update.getDecl());
+        break;
+
       case DeclUpdateKind::CXXResolvedExceptionSpec: {
         auto prototype =
           cast<FunctionDecl>(D)->getType()->castAs<FunctionProtoType>();
@@ -7604,6 +7612,34 @@ void ASTWriter::ResolvedOperatorGlobDelete(const CXXDestructorDecl *DD,
   });
 }
 
+void ASTWriter::ResolvedOperatorArrayDelete(const CXXDestructorDecl *DD,
+                                            const FunctionDecl *ArrayDelete) {
+  if (Chain && Chain->isProcessingUpdateRecords())
+    return;
+  assert(!WritingAST && "Already writing the AST!");
+  assert(ArrayDelete && "Not given an operator delete");
+  if (!Chain)
+    return;
+  Chain->forEachImportedKeyDecl(DD, [&](const Decl *D) {
+    DeclUpdates[D].push_back(
+        DeclUpdate(DeclUpdateKind::CXXResolvedDtorArrayDelete, ArrayDelete));
+  });
+}
+
+void ASTWriter::ResolvedOperatorGlobArrayDelete(
+    const CXXDestructorDecl *DD, const FunctionDecl *GlobArrayDelete) {
+  if (Chain && Chain->isProcessingUpdateRecords())
+    return;
+  assert(!WritingAST && "Already writing the AST!");
+  assert(GlobArrayDelete && "Not given an operator delete");
+  if (!Chain)
+    return;
+  Chain->forEachImportedKeyDecl(DD, [&](const Decl *D) {
+    DeclUpdates[D].push_back(DeclUpdate(
+        DeclUpdateKind::CXXResolvedDtorGlobArrayDelete, GlobArrayDelete));
+  });
+}
+
 void ASTWriter::CompletedImplicitDefinition(const FunctionDecl *D) {
   if (Chain && Chain->isProcessingUpdateRecords()) return;
   assert(!WritingAST && "Already writing the AST!");
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index c9f8797ab973f..89e6d8e2acfec 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -1794,6 +1794,8 @@ void ASTDeclWriter::VisitCXXDestructorDecl(CXXDestructorDecl *D) {
   if (D->getOperatorDelete())
     Record.AddStmt(D->getOperatorDeleteThisArg());
   Record.AddDeclRef(D->getOperatorGlobalDelete());
+  Record.AddDeclRef(D->getArrayOperatorDelete());
+  Record.AddDeclRef(D->getGlobalArrayOperatorDelete());
 
   Code = serialization::DECL_CXX_DESTRUCTOR;
 }
diff --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp
index dfbb2762ac85c..ef9d8131c511c 100644
--- a/clang/test/CodeGenCXX/dllexport.cpp
+++ b/clang/test/CodeGenCXX/dllexport.cpp
@@ -633,8 +633,9 @@ struct __declspec(dllexport) Y {
 };
 
 struct __declspec(dllexport) Z { virtual ~Z() {} };
-// The scalar deleting dtor does not get exported:
-// M32-DAG: define linkonce_odr dso_local x86_thiscallcc ptr @"??_GZ@@UAEPAXI@Z"
+// The deleting dtor does not get exported, but we emit the body of the vector
+// deleting destructor:
+// M32-DAG: define weak dso_local x86_thiscallcc ptr @"??_EZ@@UAEPAXI@Z"
 
 
 // The user-defined dtor does get exported:
diff --git a/clang/test/CodeGenCXX/microsoft-abi-extern-template.cpp b/clang/test/CodeGenCXX/microsoft-abi-extern-template.cpp
index ea12aa64ae305..67df330bc3263 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-extern-template.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-extern-template.cpp
@@ -4,7 +4,7 @@
 // own copy the vftable when emitting the available externally constructor.
 
 // CHECK: @"??_7?$Foo@H@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [
-// CHECK-SAME:   ptr @"??_G?$Foo@H@@UEAAPEAXI@Z"
+// CHECK-SAME:   ptr @"??_E?$Foo@H@@UEAAPEAXI@Z"
 // CHECK-SAME: ] }, comdat
 
 // CHECK-LABEL: define dso_local noundef ptr @"?f@@YAPEAU?$Foo@H@@XZ"()
diff --git a/clang/test/CodeGenCXX/microsoft-abi-structors.cpp b/clang/test/CodeGenCXX/microsoft-abi-structors.cpp
index 497775840e049..670988fc1ada2 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-structors.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-structors.cpp
@@ -169,7 +169,7 @@ void foo() {
 // DTORS2-LABEL: define linkonce_odr dso_local x86_thiscallcc ptr @"??_EC@dtor_in_second_nvbase@@W3AEPAXI@Z"(ptr %this, i32 %should_call_delete)
 //      Do an adjustment from B* to C*.
 // DTORS2:   getelementptr i8, ptr %{{.*}}, i32 -4
-// DTORS2:   %[[CALL:.*]] = tail call x86_thiscallcc ptr @"??_GC@dtor_in_second_nvbase@@UAEPAXI@Z"
+// DTORS2:   %[[CALL:.*]] = tail call x86_thiscallcc ptr @"??_EC@dtor_in_second_nvbase@@UAEPAXI@Z"
 // DTORS2:   ret ptr %[[CALL]]
 }
 
diff --git a/clang/test/CodeGenCXX/microsoft-abi-thunks.cpp b/clang/test/CodeGenCXX/microsoft-abi-thunks.cpp
index 38aa81253ccad..83ec158ff7f51 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-thunks.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-thunks.cpp
@@ -63,8 +63,7 @@ C::C() {}  // Emits vftable and forces thunk generation.
 
 // CODEGEN-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??_EC@@W3AEPAXI@Z"(ptr noundef %this, i32 noundef %should_call_delete) {{.*}} comdat
 // CODEGEN:   getelementptr i8, ptr {{.*}}, i32 -4
-// FIXME: should actually call _EC, not _GC.
-// CODEGEN:   call x86_thiscallcc noundef ptr @"??_GC@@UAEPAXI@Z"
+// CODEGEN:   call x86_thiscallcc noundef ptr @"??_EC@@UAEPAXI@Z"
 // CODEGEN: ret
 
 // CODEGEN-LABEL: define linkonce_odr dso_local x86_thiscallcc void @"?public_f@C@@W3AEXXZ"(ptr
diff --git a/clang/test/CodeGenCXX/microsoft-abi-vftables.cpp b/clang/test/CodeGenCXX/microsoft-abi-vftables.cpp
index bc278bdb847fc..7ceb15e40e582 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-vftables.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-vftables.cpp
@@ -8,38 +8,38 @@ struct S {
   virtual ~S();
 } s;
 
-// RTTI-DAG: [[VTABLE_S:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4S@@6B@", ptr @"??_GS@@UAEPAXI@Z"] }, comdat($"??_7S@@6B@")
+// RTTI-DAG: [[VTABLE_S:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4S@@6B@", ptr @"??_ES@@UAEPAXI@Z"] }, comdat($"??_7S@@6B@")
 // RTTI-DAG: @"??_7S@@6B@" = unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_S]], i32 0, i32 0, i32 1)
 
-// NO-RTTI-DAG: @"??_7S@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GS@@UAEPAXI@Z"] }
+// NO-RTTI-DAG: @"??_7S@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_ES@@UAEPAXI@Z"] }
 
 struct __declspec(dllimport) U {
   virtual ~U();
 } u;
 
-// RTTI-DAG: [[VTABLE_U:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4U@@6B@", ptr @"??_GU@@UAEPAXI@Z"] }
+// RTTI-DAG: [[VTABLE_U:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4U@@6B@", ptr @"??_EU@@UAEPAXI@Z"] }
 // RTTI-DAG: @"??_SU@@6B@" = unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_U]], i32 0, i32 0, i32 1)
 
-// NO-RTTI-DAG: @"??_SU@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GU@@UAEPAXI@Z"] }
+// NO-RTTI-DAG: @"??_SU@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_EU@@UAEPAXI@Z"] }
 
 struct __declspec(dllexport) V {
   virtual ~V();
 } v;
 
-// RTTI-DAG: [[VTABLE_V:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4V@@6B@", ptr @"??_GV@@UAEPAXI@Z"] }, comdat($"??_7V@@6B@")
+// RTTI-DAG: [[VTABLE_V:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4V@@6B@", ptr @"??_EV@@UAEPAXI@Z"] }, comdat($"??_7V@@6B@")
 // RTTI-DAG: @"??_7V@@6B@" = dllexport unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_V]], i32 0, i32 0, i32 1)
 
-// NO-RTTI-DAG: @"??_7V@@6B@" = weak_odr dllexport unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GV@@UAEPAXI@Z"] }
+// NO-RTTI-DAG: @"??_7V@@6B@" = weak_odr dllexport unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_EV@@UAEPAXI@Z"] }
 
 namespace {
 struct W {
   virtual ~W() {}
 } w;
 }
-// RTTI-DAG: [[VTABLE_W:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4W@?A0x{{[^@]*}}@@6B@", ptr @"??_GW@?A0x{{[^@]*}}@@UAEPAXI@Z"] }
+// RTTI-DAG: [[VTABLE_W:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4W@?A0x{{[^@]*}}@@6B@", ptr @"??_EW@?A0x{{[^@]*}}@@UAEPAXI@Z"] }
 // RTTI-DAG: @"??_7W@?A0x{{[^@]*}}@@6B@" = internal unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_W]], i32 0, i32 0, i32 1)
 
-// NO-RTTI-DAG: @"??_7W@?A0x{{[^@]*}}@@6B@" = internal unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GW@?A0x{{[^@]*}}@@UAEPAXI@Z"] }
+// NO-RTTI-DAG: @"??_7W@?A0x{{[^@]*}}@@6B@" = internal unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_EW@?A0x{{[^@]*}}@@UAEPAXI@Z"] }
 
 struct X {};
 template <class> struct Y : virtual X {
@@ -49,7 +49,7 @@ template <class> struct Y : virtual X {
 
 extern template class Y<int>;
 template Y<int>::Y();
-// RTTI-DAG: [[VTABLE_Y:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4?$Y@H@@6B@", ptr @"??_G?$Y@H@@UAEPAXI@Z"] }, comdat($"??_7?$Y@H@@6B@")
+// RTTI-DAG: [[VTABLE_Y:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4?$Y@H@@6B@", ptr @"??_E?$Y@H@@UAEPAXI@Z"] }, comdat($"??_7?$Y@H@@6B@")
 // RTTI-DAG: @"??_7?$Y@H@@6B@" = unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_Y]], i32 0, i32 0, i32 1)
 
-// NO-RTTI-DAG: @"??_7?$Y@H@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_G?$Y@H@@UAEPAXI@Z"] }, comdat
+// NO-RTTI-DAG: @"??_7?$Y@H@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_E?$Y@H@@UAEPAXI@Z"] }, comdat
diff --git a/clang/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp b/clang/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
index b54775f6c5dd0..7e9dce18b2797 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
@@ -80,6 +80,15 @@ B::~B() {
   // CHECK2: call x86_thiscallcc void @"??1VBase@@UAE@XZ"(ptr {{[^,]*}} %[[VBASE_i8]])
   // CHECK2: ret
 
+  // CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??0B@test2@@QAE@XZ"
+  // CHECK2:           (ptr {{[^,]*}} returned align 4 dereferenceable(4) %this, i32 noundef %is_most_derived)
+  // CHECK2: call x86_thiscallcc noundef ptr @"??0A@test2@@QAE@XZ"(ptr {{[^,]*}} %{{.*}})
+  // CHECK2: ret
+
+  // CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??_GD@pr36921@@UAEPAXI@Z"(
+  // CHECK2:   %[[THIS_RELOAD:.*]] = load ptr, ptr
+  // CHECK2:   %[[THIS_ADJ_i8:.*]] = getelementptr inbounds i8, ptr %[[THIS_RELOAD]], i32 -4
+
   // CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??_GB@@UAEPAXI@Z"
   // CHECK2:   store ptr %{{.*}}, ptr %[[THIS_ADDR:.*]], align 4
   // CHECK2:   %[[THIS_i8:.*]] = getelementptr inbounds i8, ptr %[[THIS_PARAM_i8:.*]], i32 -8
@@ -293,11 +302,6 @@ void callC() { C x; }
 // CHECK: call x86_thiscallcc noundef ptr @"??0A@test2@@QAE@XZ"(ptr {{[^,]*}} %{{.*}})
 // CHECK: ret
 
-// CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??0B@test2@@QAE@XZ"
-// CHECK2:           (ptr {{[^,]*}} returned align 4 dereferenceable(4) %this, i32 noundef %is_most_derived)
-// CHECK2: call x86_thiscallcc noundef ptr @"??0A@test2@@QAE@XZ"(ptr {{[^,]*}} %{{.*}})
-// CHECK2: ret
-
 }
 
 namespace test3 {
@@ -480,9 +484,6 @@ struct B {
 struct C : virtual B {};
 struct D : virtual A, C {};
 D d;
-// CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??_GD@pr36921@@UAEPAXI@Z"(
-// CHECK2:   %[[THIS_RELOAD:.*]] = load ptr, ptr
-// CHECK2:   %[[THIS_ADJ_i8:.*]] = getelementptr inbounds i8, ptr %[[THIS_RELOAD]], i32 -4
 }
 
 namespace issue_60465 {
diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-multiple-nonvirtual-inheritance-vdtors.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-multiple-nonvirtual-inheritance-vdtors.cpp
index a407766f8ed9f..74150b0ecb535 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-vtables-multiple-nonvirtual-inheritance-vdtors.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-multiple-nonvirtual-inheritance-vdtors.cpp
@@ -12,18 +12,18 @@ struct B {
 
 struct C : A, B {
   // CHECK-LABEL: VFTable for 'A' in 'C' (2 entries).
-  // CHECK-NEXT:   0 | C::~C() [scalar deleting]
+  // CHECK-NEXT:   0 | C::~C() [vector deleting]
   // CHECK-NEXT:   1 | void A::z1()
 
   // CHECK-LABEL: VFTable for 'B' in 'C' (1 entry).
-  // CHECK-NEXT:   0 | C::~C() [scalar deleting]
+  // CHECK-NEXT:   0 | C::~C() [vector deleting]
   // CHECK-NEXT:       [this adjustment: -4 non-virtual]
 
   // CHECK-LABEL: Thunks for 'C::~C()' (1 entry).
   // CHECK-NEXT:   0 | [this adjustment: -4 non-virtual]
 
   // CHECK-LABEL: VFTable indices for 'C' (1 entry).
-  // CHECK-NEXT:   0 | C::~C() [scalar deleting]
+  // CHECK-NEXT:   0 | C::~C() [vector deleting]
   virtual ~C();
 };
 
@@ -41,7 +41,7 @@ struct E : D, B {
   // CHECK-NEXT:   0 | void D::z4()
 
   // CHECK-LABEL: VFTable for 'B' in 'E' (1 entry).
-  // CHECK-NEXT:   0 | E::~E() [scalar deleting]
+  // CHECK-NEXT:   0 | E::~E() [vector deleting]
   // CHECK-NEXT:       [this adjustment: -4 non-virtual]
 
   // CHECK-LABEL: Thunks for 'E::~E()' (1 entry).
@@ -49,7 +49,7 @@ struct E : D, B {
 
   // CHECK-LABEL: VFTable indices for 'E' (1 entry).
   // CHECK-NEXT:   -- accessible via vfptr at offset 4 --
-  // CHECK-NEXT:   0 | E::~E() [scalar deleting]
+  // CHECK-NEXT:   0 | E::~E() [vector deleting]
 };
 
 void build_vftable(E *obj) { delete obj; }
@@ -61,7 +61,7 @@ struct F : D, B {
   // CHECK-NEXT:   0 | void D::z4()
 
   // CHECK-LABEL: VFTable for 'B' in 'F' (1 entry).
-  // CHECK-NEXT:   0 | F::~F() [scalar deleting]
+  // CHECK-NEXT:   0 | F::~F() [vector deleting]
   // CHECK-NEXT:       [this adjustment: -4 non-virtual]
 
   // CHECK-LABEL: Thunks for 'F::~F()' (1 entry).
@@ -69,7 +69,7 @@ struct F : D, B {
 
   // CHECK-LABEL: VFTable indices for 'F' (1 entry).
   // CHECK-NEXT:   -- accessible via vfptr at offset 4 --
-  // CHECK-NEXT:   0 | F::~F() [scalar deleting]
+  // CHECK-NEXT:   0 | F::~F() [vector deleting]
 };
 
 void build_vftable(F *obj) { delete obj; }
@@ -79,7 +79,7 @@ struct G : F {
   // CHECK-NEXT:   0 | void D::z4()
 
   // CHECK-LABEL: VFTable for 'B' in 'F' in 'G' (1 entry).
-  // CHECK-NEXT:   0 | G::~G() [scalar deleting]
+  // CHECK-NEXT:   0 | G::~G() [vector deleting]
   // CHECK-NEXT:       [this adjustment: -4 non-virtual]
 
   // CHECK-LABEL: Thunks for 'G::~G()' (1 entry).
@@ -87,7 +87,7 @@ struct G : F {
 
   // CHECK-LABEL: VFTable indices for 'G' (1 entry).
   // CHECK-NEXT:   -- accessible via vfptr at offset 4 --
-  // CHECK-NEXT:   0 | G::~G() [scalar deleting]
+  // CHECK-NEXT:   0 | G::~G() [vector deleting]
   virtual ~G();
 };
 
diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-return-thunks.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-return-thunks.cpp
index 5030a5dcd2a50..1a589370d3a74 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-vtables-return-thunks.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-return-thunks.cpp
@@ -213,6 +213,6 @@ struct C : virtual B { C *f(); };
 C c;
 // VFTABLES-LABEL: VFTable indices for 'pr34302::C' (2 entries).
 // VFTABLES-NEXT:  -- accessible via vbtable index 1, vfptr at offset 0 --
-// VFTABLES-NEXT:    0 | pr34302::C::~C() [scalar deleting]
+// VFTABLES-NEXT:    0 | pr34302::C::~C() [vector deleting]
 // VFTABLES-NEXT:    2 | C *pr34302::C::f()
 }
diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-single-inheritance.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-single-inheritance.cpp
index b0bf927d38f7c..c95202e8cc253 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-vtables-single-inheritance.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-single-inheritance.cpp
@@ -44,10 +44,10 @@ void use(B *obj) { obj->f(); }
 
 struct C {
   // CHECK-LABEL: VFTable for 'C' (2 entries)
-  // CHECK-NEXT: 0 | C::~C() [scalar deleting]
+  // CHECK-NEXT: 0 | C::~C() [vector deleting]
   // CHECK-NEXT: 1 | void C::f()
   // CHECK-LABEL: VFTable indices for 'C' (2 entries).
-  // CHECK-NEXT: 0 | C::~C() [scalar deleting]
+  // CHECK-NEXT: 0 | C::~C() [vector deleting]
   // CHECK-NEXT: 1 | void C::f()
 
   virtual ~C();
@@ -60,10 +60,10 @@ void use(C *obj) { obj->f(); }
 struct D {
   // CHECK-LABEL: VFTable for 'D' (2 entries)
   // CHECK-NEXT: 0 | void D::f()
-  // CHECK-NEXT: 1 | D::~D() [scalar deleting]
+  // CHECK-NEXT: 1 | D::~D() [vector deleting]
   // CHECK-LABEL: VFTable indices for 'D' (2 entries)
   // CHECK-NEXT: 0 | void D::f()
-  // CHECK-NEXT: 1 | D::~D() [scalar deleting]
+  // CHECK-NEXT: 1 | D::~D() [vector deleting]
 
   virtual void f();
   virtual ~D();
@@ -77,10 +77,10 @@ struct E : A {
   // CHECK-NEXT: 0 | void A::f()
   // CHECK-NEXT: 1 | void A::g()
   // CHECK-NEXT: 2 | void A::h()
-  // CHECK-NEXT: 3 | E::~E() [scalar deleting]
+  // CHECK-NEXT: 3 | E::~E() [vector deleting]
   // CHECK-NEXT: 4 | void E::i()
   // CHECK-LABEL: VFTable indices for 'E' (2 entries).
-  // CHECK-NEXT: 3 | E::~E() [scalar deleting]
+  // CHECK-NEXT: 3 | E::~E() [vector deleting]
   // CHECK-NEXT: 4 | void E::i()
 
   // ~E would be the key method, but it isn't used, and MS ABI has no key
@@ -98,10 +98,10 @@ struct F : A {
   // CHECK-NEXT: 1 | void A::g()
   // CHECK-NEXT: 2 | void A::h()
   // CHECK-NEXT: 3 | void F::i()
-  // CHECK-NEXT: 4 | F::~F() [scalar deleting]
+  // CHECK-NEXT: 4 | F::~F() [vector deleting]
   // CHECK-LABEL: VFTable indices for 'F' (2 entries).
   // CHECK-NEXT: 3 | void F::i()
-  // CHECK-NEXT: 4 | F::~F() [scalar deleting]
+  // CHECK-NEXT: 4 | F::~F() [vector deleting]
 
   virtual void i();
   virtual ~F();
@@ -115,12 +115,12 @@ struct G : E {
   // CHECK-NEXT: 0 | void G::f()
   // CHECK-NEXT: 1 | void A::g()
   // CHECK-NEXT: 2 | void A::h()
-  // CHECK-NEXT: 3 | G::~G() [scalar deleting]
+  // CHECK-NEXT: 3 | G::~G() [vector deleting]
   // CHECK-NEXT: 4 | void E::i()
   // CHECK-NEXT: 5 | void G::j()
   // CHECK-LABEL: VFTable indices for 'G' (3 entries).
   // CHECK-NEXT: 0 | void G::f()
-  // CHECK-NEXT: 3 | G::~G() [scalar deleting]
+  // CHECK-NEXT: 3 | G::~G() [vector deleting]
   // CHECK-NEXT: 5 | void G::j()
 
   virtual void f();  // overrides A::f()
diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance-vtordisps.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance-vtordisps.cpp
index c5ce69f5cbcac..be9f281560dcf 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance-vtordisps.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance-vtordisps.cpp
@@ -57,7 +57,7 @@ struct A : virtual V1 {
   // CHECK-LABEL: VFTable for 'V1' in 'simple::A' (2 entries).
   // CHECK-NEXT: 0 | void simple::A::f()
   // CHECK-NEXT:     [this adjustment: vtordisp at -4, 0 non-virtual]
-  // CHECK-NEXT: 1 | simple::A::~A() [scalar deleting]
+  // CHECK-NEXT: 1 | simple::A::~A() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -4, 0 non-virtual]
 
   // CHECK-LABEL: Thunks for 'simple::A::~A()' (1 entry).
@@ -79,7 +79,7 @@ void use(A *obj) { obj->f(); }
 struct B : virtual V3 {
   // CHECK-LABEL: VFTable for 'Z' in 'V3' in 'simple::B' (2 entries).
   // CHECK-NEXT: 0 | void Z::g()
-  // CHECK-NEXT: 1 | simple::B::~B() [scalar deleting]
+  // CHECK-NEXT: 1 | simple::B::~B() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -4, 0 non-virtual]
 
   // CHECK-LABEL: Thunks for 'simple::B::~B()' (1 entry).
@@ -88,7 +88,7 @@ struct B : virtual V3 {
   // CHECK-LABEL: VFTable for 'V2' in 'V3' in 'simple::B' (2 entries).
   // CHECK-NEXT: 0 | void simple::B::f()
   // CHECK-NEXT:     [this adjustment: vtordisp at -12, 0 non-virtual]
-  // CHECK-NEXT: 1 | simple::B::~B() [scalar deleting]
+  // CHECK-NEXT: 1 | simple::B::~B() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -12, -8 non-virtual]
 
   // CHECK-LABEL: Thunks for 'simple::B::~B()' (1 entry).
@@ -115,7 +115,7 @@ void use(B *obj) { obj->f(); }
 struct C : virtual V4 {
   // CHECK-LABEL: VFTable for 'Z' in 'V4' in 'simple::C' (2 entries).
   // CHECK-NEXT: 0 | void Z::g()
-  // CHECK-NEXT: 1 | simple::C::~C() [scalar deleting]
+  // CHECK-NEXT: 1 | simple::C::~C() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -4, 0 non-virtual]
 
   // CHECK-LABEL: Thunks for 'simple::C::~C()' (1 entry).
@@ -124,7 +124,7 @@ struct C : virtual V4 {
   // CHECK-LABEL: VFTable for 'V1' in 'V4' in 'simple::C' (2 entries).
   // CHECK-NEXT: 0 | void simple::C::f()
   // CHECK-NEXT:     [this adjustment: vtordisp at -12, 0 non-virtual]
-  // CHECK-NEXT: 1 | simple::C::~C() [scalar deleting]
+  // CHECK-NEXT: 1 | simple::C::~C() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -12, -8 non-virtual]
 
   // CHECK-LABEL: Thunks for 'simple::C::~C()' (1 entry).
@@ -136,7 +136,7 @@ struct C : virtual V4 {
   // CHECK-LABEL: VFTable for 'V2' in 'V4' in 'simple::C' (2 entries).
   // CHECK-NEXT: 0 | void simple::C::f()
   // CHECK-NEXT:     [this adjustment: vtordisp at -16, -4 non-virtual]
-  // CHECK-NEXT: 1 | simple::C::~C() [scalar deleting]
+  // CHECK-NEXT: 1 | simple::C::~C() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -16, -12 non-virtual]
 
   // CHECK-LABEL: Thunks for 'simple::C::~C()' (1 entry).
@@ -162,7 +162,7 @@ class D : B {
   // CHECK-LABEL: VFTable for 'V2' in 'V3' in 'simple::B' in 'simple::D' (2 entries).
   // CHECK-NEXT: 0 | void simple::B::f()
   // CHECK-NEXT:     [this adjustment: vtordisp at -12, -4 non-virtual]
-  // CHECK-NEXT: 1 | simple::D::~D() [scalar deleting]
+  // CHECK-NEXT: 1 | simple::D::~D() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -12, -8 non-virtual]
   D();
   int z;
@@ -180,12 +180,12 @@ struct F : virtual E {
   // CHECK-LABEL: VFTable for 'Z' in 'V3' in 'simple::E' in 'simple::F' (2 entries).
   // CHECK-NEXT:   0 | void simple::F::g()
   // CHECK-NEXT:       [this adjustment: vtordisp at -4, 0 non-virtual]
-  // CHECK-NEXT:   1 | simple::F::~F() [scalar deleting]
+  // CHECK-NEXT:   1 | simple::F::~F() [vector deleting]
   // CHECK-NEXT:       [this adjustment: vtordisp at -4, 0 non-virtual]
 
   // CHECK-LABEL: VFTable for 'V2' in 'V3' in 'simple::E' in 'simple::F' (2 entries).
   // CHECK-NEXT:   0 | void simple::E::f()
-  // CHECK-NEXT:   1 | simple::F::~F() [scalar deleting]
+  // CHECK-NEXT:   1 | simple::F::~F() [vector deleting]
   // CHECK-NEXT:       [this adjustment: vtordisp at -12, -8 non-virtual]
 
   F();
@@ -202,12 +202,12 @@ struct G : F {
   // CHECK-LABEL: VFTable for 'Z' in 'V3' in 'simple::E' in 'simple::F' in 'simple::G' (2 entries).
   // CHECK-NEXT:   0 | void simple::F::g()
   // CHECK-NEXT:       [this adjustment: vtordisp at -4, -4 non-virtual]
-  // CHECK-NEXT:   1 | simple::G::~G() [scalar deleting]
+  // CHECK-NEXT:   1 | simple::G::~G() [vector deleting]
   // CHECK-NEXT:       [this adjustment: vtordisp at -4, 0 non-virtual]
 
   // CHECK-LABEL: VFTable for 'V2' in 'V3' in 'simple::E' in 'simple::F' in 'simple::G' (2 entries).
   // CHECK-NEXT:   0 | void simple::E::f()
-  // CHECK-NEXT:   1 | simple::G::~G() [scalar deleting]
+  // CHECK-NEXT:   1 | simple::G::~G() [vector deleting]
   // CHECK-NEXT:       [this adjustment: vtordisp at -12, -8 non-virtual]
 
   G();
@@ -240,7 +240,7 @@ struct A : virtual simple::A {
   // CHECK-NEXT: 0 | void simple::A::f()
   // CHECK-NEXT:     [this adjustment: vtordisp at -4, vbptr at 8 to the left,
   // CHECK-NEXT:      vboffset at 8 in the vbtable, 8 non-virtual]
-  // CHECK-NEXT: 1 | extended::A::~A() [scalar deleting]
+  // CHECK-NEXT: 1 | extended::A::~A() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -4, 0 non-virtual]
 
   // CHECK-LABEL: Thunks for 'void simple::A::f()' (1 entry).
@@ -265,7 +265,7 @@ struct B : virtual simple::A {
 
   // CHECK-LABEL: VFTable for 'V1' in 'simple::A' in 'extended::B' (2 entries).
   //  ...
-  // CHECK: 1 | extended::B::~B() [scalar deleting]
+  // CHECK: 1 | extended::B::~B() [vector deleting]
   // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual]
 
   // CHECK-LABEL: Thunks for 'void simple::A::f()' (1 entry).
@@ -353,7 +353,7 @@ struct G : virtual simple::A {
   // CHECK-NEXT: 0 | void simple::A::f()
   // CHECK-NEXT:     [this adjustment: vtordisp at -4, vbptr at 8 to the left,
   // CHECK-NEXT:      vboffset at 8 in the vbtable, 8 non-virtual]
-  // CHECK-NEXT: 1 | extended::G::~G() [scalar deleting]
+  // CHECK-NEXT: 1 | extended::G::~G() [vector deleting]
   // CHECK-NEXT:     [this adjustment: vtordisp at -4, 0 non-virtual]
 
   // CHECK-LABEL: Thunks for 'void simple::A::f()' (1 entry).
@@ -374,7 +374,7 @@ void use(G *obj) { obj->g(); }
 struct H : Z, A {
   // CHECK-LABEL: VFTable for 'Z' in 'extended::H' (2 entries).
   // CHECK-NEXT: 0 | void Z::g()
-  // CHECK-NEXT: 1 | extended::H::~H() [scalar deleting]
+  // CHECK-NEXT: 1 | extended::H::~H() [vector deleting]
 
   // CHECK-LABEL: VFTable for 'V1' in 'simple::A' in 'extended::A' in 'extended::H' (2 entries).
   // CHECK-NEXT: 0 | void simple::A::f()
diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance.cpp
index 257ba270291c8..e5e6ea5f42c1c 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance.cpp
@@ -492,7 +492,7 @@ struct X {
 
 struct Y : virtual X {
   // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::Y' (2 entries).
-  // CHECK-NEXT: 0 | vdtors::Y::~Y() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::Y::~Y() [vector deleting]
   // CHECK-NEXT: 1 | void vdtors::X::zzz()
 
   // CHECK-NOT: Thunks for 'vdtors::Y::~Y()'
@@ -515,7 +515,7 @@ struct U : virtual W {
   // CHECK-NEXT: 0 | void vdtors::Z::z()
 
   // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::W' in 'vdtors::U' (2 entries).
-  // CHECK-NEXT: 0 | vdtors::U::~U() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::U::~U() [vector deleting]
   // CHECK-NEXT:     [this adjustment: -4 non-virtual]
   // CHECK-NEXT: 1 | void vdtors::X::zzz()
 
@@ -524,7 +524,7 @@ struct U : virtual W {
 
   // CHECK-LABEL: VFTable indices for 'vdtors::U' (1 entry).
   // CHECK-NEXT: -- accessible via vbtable index 1, vfptr at offset 4 --
-  // CHECK-NEXT: 0 | vdtors::U::~U() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::U::~U() [vector deleting]
   virtual ~U();
 };
 
@@ -536,7 +536,7 @@ struct V : virtual W {
   // CHECK-NEXT: 0 | void vdtors::Z::z()
 
   // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::W' in 'vdtors::V' (2 entries).
-  // CHECK-NEXT: 0 | vdtors::V::~V() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::V::~V() [vector deleting]
   // CHECK-NEXT:     [this adjustment: -4 non-virtual]
   // CHECK-NEXT: 1 | void vdtors::X::zzz()
 
@@ -545,7 +545,7 @@ struct V : virtual W {
 
   // CHECK-LABEL: VFTable indices for 'vdtors::V' (1 entry).
   // CHECK-NEXT: -- accessible via vbtable index 1, vfptr at offset 4 --
-  // CHECK-NEXT: 0 | vdtors::V::~V() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::V::~V() [vector deleting]
 };
 
 V v;
@@ -557,7 +557,7 @@ struct T : virtual X {
 
 struct P : T, Y {
   // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::T' in 'vdtors::P' (2 entries).
-  // CHECK-NEXT: 0 | vdtors::P::~P() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::P::~P() [vector deleting]
   // CHECK-NEXT: 1 | void vdtors::X::zzz()
 
   // CHECK-NOT: Thunks for 'vdtors::P::~P()'
@@ -574,18 +574,18 @@ struct Q {
 // PR19172: Yet another diamond we miscompiled.
 struct R : virtual Q, X {
   // CHECK-LABEL: VFTable for 'vdtors::Q' in 'vdtors::R' (1 entry).
-  // CHECK-NEXT: 0 | vdtors::R::~R() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::R::~R() [vector deleting]
   // CHECK-NEXT:     [this adjustment: -8 non-virtual]
 
   // CHECK-LABEL: Thunks for 'vdtors::R::~R()' (1 entry).
   // CHECK-NEXT: 0 | [this adjustment: -8 non-virtual]
 
   // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::R' (2 entries).
-  // CHECK-NEXT: 0 | vdtors::R::~R() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::R::~R() [vector deleting]
   // CHECK-NEXT: 1 | void vdtors::X::zzz()
 
   // CHECK-LABEL: VFTable indices for 'vdtors::R' (1 entry).
-  // CHECK-NEXT: 0 | vdtors::R::~R() [scalar deleting]
+  // CHECK-NEXT: 0 | vdtors::R::~R() [vector deleting]
   virtual ~R();
 };
 
diff --git a/clang/test/CodeGenCXX/microsoft-no-rtti-data.cpp b/clang/test/CodeGenCXX/microsoft-no-rtti-data.cpp
index 069f0226ab948..c8e374e51a031 100644
--- a/clang/test/CodeGenCXX/microsoft-no-rtti-data.cpp
+++ b/clang/test/CodeGenCXX/microsoft-no-rtti-data.cpp
@@ -2,7 +2,7 @@
 
 // vftable shouldn't have RTTI data in it.
 // CHECK-NOT: @"??_R4S@@6B@"
-// CHECK: @"??_7S@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GS@@UAEPAXI@Z"] }, comdat
+// CHECK: @"??_7S@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_ES@@UAEPAXI@Z"] }, comdat
 
 struct type_info;
 namespace std { using ::type_info; }
diff --git a/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp
new file mode 100644
index 0000000000000..e8012abb79aee
--- /dev/null
+++ b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp
@@ -0,0 +1,336 @@
+// RUN: %clang_cc1 -emit-llvm -fms-extensions %s -triple=x86_64-pc-windows-msvc -o - | FileCheck --check-prefixes=X64,CHECK %s
+// RUN: %clang_cc1 -emit-llvm -fms-extensions %s -triple=i386-pc-windows-msvc -o - | FileCheck --check-prefixes=X86,CHECK %s
+// RUN: %clang_cc1 -emit-llvm -fms-extensions %s -triple=x86_64-pc-windows-msvc -fclang-abi-compat=21 -o - | FileCheck --check-prefixes=CLANG21 %s
+
+struct Bird {  // polymorphic base that the other bird fixtures derive from
+  virtual ~Bird();  // only declared here; the definition is out of line below
+};
+
+struct Parrot : public Bird {
+// X64: @[[ParrotVtable:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Parrot@@6B@", ptr @"??_EParrot@@UEAAPEAXI@Z"] }, comdat($"??_7Parrot@@6B@")
+// X86: @[[ParrotVtable:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Parrot@@6B@", ptr @"??_EParrot@@UAEPAXI@Z"] }, comdat($"??_7Parrot@@6B@")
+// CLANG21: @[[ParrotVtable:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Parrot@@6B@", ptr @"??_GParrot@@UEAAPEAXI@Z"] }, comdat($"??_7Parrot@@6B@")
+// X64: @[[Bird:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Bird@@6B@", ptr @"??_EBird@@UEAAPEAXI@Z"] }, comdat($"??_7Bird@@6B@")
+// X86: @[[Bird:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Bird@@6B@", ptr @"??_EBird@@UAEPAXI@Z"] }, comdat($"??_7Bird@@6B@")
+// CLANG21: @[[Bird:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Bird@@6B@", ptr @"??_GBird@@UEAAPEAXI@Z"] }, comdat($"??_7Bird@@6B@")
+  virtual ~Parrot() {}
+};
+
+Bird::~Bird() {}
+
+// For the weird bird we first emit a scalar deleting destructor, then find
+// out that we need a vector deleting destructor and remove the alias.
+struct JustAWeirdBird {
+  virtual ~JustAWeirdBird() {}  // virtual destructor defined inline
+
+  bool doSmth(int n) {  // array-allocates and array-deletes n objects of its own class
+    JustAWeirdBird *c = new JustAWeirdBird[n];  // array new of the enclosing type
+
+    delete[] c;  // array delete through a pointer of the same static type
+    return true;
+  }
+};
+
+int i = 0;
+struct HasOperatorDelete : public Bird{  // Bird-derived type that declares its own deallocation functions
+~HasOperatorDelete() { }  // implicitly virtual because Bird's destructor is virtual
+void operator delete(void *p) { i-=2; }  // class-specific scalar delete: subtracts 2 from the global i
+void operator delete[](void *p) { i--; }  // class-specific array delete: subtracts 1 from the global i
+};
+
+struct AllocatedAsArray : public Bird {
+
+};
+
+// The vector deleting dtor for Bird is an alias because there are no
+// new Bird[] expressions in the TU.
+// X64: @"??_EBird@@UEAAPEAXI@Z" = weak dso_local unnamed_addr alias ptr (ptr, i32), ptr @"??_GBird@@UEAAPEAXI@Z"
+// X86: @"??_EBird@@UAEPAXI@Z" = weak dso_local unnamed_addr alias ptr (ptr, i32), ptr @"??_GBird@@UAEPAXI@Z"
+// No scalar destructor for Parrot.
+// CHECK-NOT: @"??_GParrot"
+// No vector destructor definition for Bird.
+// CHECK-NOT: define{{.*}}@"??_EBird"
+// No scalar deleting dtor for JustAWeirdBird.
+// CHECK-NOT: @"??_GJustAWeirdBird"
+// CLANG21-NOT: @"??_E
+
+void dealloc(Bird *p) {
+  delete[] p;
+}
+
+Bird* alloc() {
+  Parrot* P = new Parrot[38];
+  return P;
+}
+
+
+template<class C>  // C is the type that foo() allocates and deletes
+struct S {
+  void foo() { void *p = new C(); delete (C *)p; }  // new a C, hold it as void *, delete it through a cast back to C *
+};
+
+S<AllocatedAsArray[1][3]> sp;
+
+void bar() {  // exercises the helpers and types declared above
+  dealloc(alloc());  // array-deletes, through a Bird *, the Parrot array returned by alloc()
+
+  JustAWeirdBird B;
+  B.doSmth(38);  // array new/delete of JustAWeirdBird inside its own member function
+
+  Bird *p = new HasOperatorDelete[2];  // array of a type with class-specific operator delete[]
+  dealloc(p);
+
+  sp.foo();  // sp is S<AllocatedAsArray[1][3]>, so C inside foo() is an array type
+}
+
+// CHECK-LABEL: define dso_local void @{{.*}}dealloc{{.*}}(
+// CHECK-SAME: ptr noundef %[[PTR:.*]])
+// CHECK: entry:
+// CHECK-NEXT:   %[[PTRADDR:.*]] = alloca ptr
+// CHECK-NEXT:   store ptr %[[PTR]], ptr %[[PTRADDR]]
+// CHECK-NEXT:   %[[LPTR:.*]] = load ptr, ptr %[[PTRADDR]]
+// CHECK-NEXT:   %[[ISNULL:.*]] = icmp eq ptr %[[LPTR]], null
+// CHECK-NEXT:   br i1 %[[ISNULL]], label %delete.end, label %delete.notnull
+// CHECK: delete.notnull:
+// X64-NEXT:   %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[LPTR]], i64 -8
+// X86-NEXT:   %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[LPTR]], i32 -4
+// X64-NEXT:   %[[HOWMANY:.*]] = load i64, ptr %[[COOKIEGEP]]
+// X86-NEXT:   %[[HOWMANY:.*]] = load i32, ptr %[[COOKIEGEP]]
+// X64-NEXT:   %[[ISNOELEM:.*]] = icmp eq i64 %[[HOWMANY]], 0
+// X86-NEXT:   %[[ISNOELEM:.*]] = icmp eq i32 %[[HOWMANY]], 0
+// CHECK-NEXT:   br i1 %[[ISNOELEM]], label %vdtor.nocall, label %vdtor.call
+// CHECK: vdtor.nocall:
+// X64-NEXT:   %[[HOWMANYBYTES:.*]] = mul i64 8, %[[HOWMANY]]
+// X86-NEXT:   %[[HOWMANYBYTES:.*]] = mul i32 4, %[[HOWMANY]]
+// X64-NEXT:   %[[ADDCOOKIESIZE:.*]] = add i64 %[[HOWMANYBYTES]], 8
+// X86-NEXT:   %[[ADDCOOKIESIZE:.*]] = add i32 %[[HOWMANYBYTES]], 4
+// X64-NEXT:   call void @"??_V@YAXPEAX_K@Z"(ptr noundef %[[COOKIEGEP]], i64 noundef %[[ADDCOOKIESIZE]])
+// X86-NEXT:   call void @"??_V@YAXPAXI@Z"(ptr noundef %[[COOKIEGEP]], i32 noundef %[[ADDCOOKIESIZE]])
+// CHECK-NEXT:   br label %delete.end
+// CHECK: vdtor.call:
+// CHECK-NEXT:   %[[VTABLE:.*]] = load ptr, ptr %[[LPTR]]
+// CHECK-NEXT:   %[[FPGEP:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i64 0
+// CHECK-NEXT:   %[[FPLOAD:.*]]  = load ptr, ptr %[[FPGEP]]
+// X64-NEXT:   %[[CALL:.*]] = call noundef ptr %[[FPLOAD]](ptr noundef nonnull align 8 dereferenceable(8) %[[LPTR]], i32 noundef 3)
+// X86-NEXT:   %[[CALL:.*]] = call x86_thiscallcc noundef ptr %[[FPLOAD]](ptr noundef nonnull align 4 dereferenceable(4) %[[LPTR]], i32 noundef 3)
+// CHECK-NEXT:   br label %delete.end
+// CHECK: delete.end:
+// CHECK-NEXT:   ret void
+
+// Normal loop over the array elements for clang21 ABI
+// CLANG21-LABEL: define dso_local void @"?dealloc@@YAXPEAUBird@@@Z"
+// CLANG21:   %p.addr = alloca ptr
+// CLANG21-NEXT:   store ptr %p, ptr %p.addr
+// CLANG21-NEXT:   %0 = load ptr, ptr %p.addr
+// CLANG21-NEXT:   %isnull = icmp eq ptr %0, null
+// CLANG21-NEXT:   br i1 %isnull, label %delete.end2, label %delete.notnull
+// CLANG21: delete.notnull:
+// CLANG21-NEXT:   %1 = getelementptr inbounds i8, ptr %0, i64 -8
+// CLANG21-NEXT:   %2 = load i64, ptr %1
+// CLANG21-NEXT:   %delete.end = getelementptr inbounds %struct.Bird, ptr %0, i64 %2
+// CLANG21-NEXT:   %arraydestroy.isempty = icmp eq ptr %0, %delete.end
+// CLANG21-NEXT:   br i1 %arraydestroy.isempty, label %arraydestroy.done1, label %arraydestroy.body
+// CLANG21: arraydestroy.body:
+// CLANG21-NEXT:   %arraydestroy.elementPast = phi ptr [ %delete.end, %delete.notnull ], [ %arraydestroy.element, %arraydestroy.body ]
+// CLANG21-NEXT:   %arraydestroy.element = getelementptr inbounds %struct.Bird, ptr %arraydestroy.elementPast, i64 -1
+// CLANG21-NEXT:   call void @"??1Bird@@UEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+// CLANG21-NEXT:   %arraydestroy.done = icmp eq ptr %arraydestroy.element, %0
+// CLANG21-NEXT:   br i1 %arraydestroy.done, label %arraydestroy.done1, label %arraydestroy.body
+// CLANG21: arraydestroy.done1:
+// CLANG21-NEXT:   %3 = mul i64 8, %2
+// CLANG21-NEXT:   %4 = add i64 %3, 8
+// CLANG21-NEXT:   call void @"??_V@YAXPEAX_K@Z"(ptr noundef %1, i64 noundef %4)
+// CLANG21-NEXT:   br label %delete.end2
+
+// Definition of S::foo, check that it has vector deleting destructor call
+// X64-LABEL: define linkonce_odr dso_local void @"?foo@?$S@$$BY102UAllocatedAsArray@@@@QEAAXXZ"
+// X86-LABEL: define linkonce_odr dso_local x86_thiscallcc void @"?foo@?$S@$$BY102UAllocatedAsArray@@@@QAEXXZ"
+// X64: %[[NEWCALL:.*]] = call noalias noundef nonnull ptr @"??_U@YAPEAX_K@Z"(i64 noundef 32)
+// X86: %[[NEWCALL:.*]] = call noalias noundef nonnull ptr @"??_U@YAPAXI@Z"(i32 noundef 16)
+// X64: %[[ARR:.*]] = getelementptr inbounds i8, ptr %[[NEWCALL]], i64 8
+// X86: %[[ARR:.*]] = getelementptr inbounds i8, ptr %[[NEWCALL]], i32 4
+// CHECK: store ptr %[[ARR]], ptr %[[DP:.*]]
+// CHECK: %[[DEL_PTR:.*]] = load ptr, ptr %[[DP:.*]]
+// CHECK: delete.notnull:
+// X64-NEXT:   %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[DEL_PTR]], i64 -8
+// X86-NEXT:   %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[DEL_PTR]], i32 -4
+// X64-NEXT:   %[[HOWMANY:.*]] = load i64, ptr %[[COOKIEGEP]]
+// X86-NEXT:   %[[HOWMANY:.*]] = load i32, ptr %[[COOKIEGEP]]
+// X64-NEXT:   %[[ISNOELEM:.*]] = icmp eq i64 %[[HOWMANY]], 0
+// X86-NEXT:   %[[ISNOELEM:.*]] = icmp eq i32 %[[HOWMANY]], 0
+// CHECK-NEXT:   br i1 %[[ISNOELEM]], label %vdtor.nocall, label %vdtor.call
+// CHECK: vdtor.nocall:                                     ; preds = %delete.notnull
+// X64-NEXT:   %[[HOWMANYBYTES:.*]] = mul i64 8, %[[HOWMANY]]
+// X86-NEXT:   %[[HOWMANYBYTES:.*]] = mul i32 4, %[[HOWMANY]]
+// X64-NEXT:   %[[ADDCOOKIESIZE:.*]] = add i64 %[[HOWMANYBYTES]], 8
+// X86-NEXT:   %[[ADDCOOKIESIZE:.*]] = add i32 %[[HOWMANYBYTES]], 4
+// X64-NEXT:   call void @"??_V@YAXPEAX_K@Z"(ptr noundef %[[COOKIEGEP]], i64 noundef %[[ADDCOOKIESIZE]])
+// X86-NEXT:   call void @"??_V@YAXPAXI@Z"(ptr noundef %[[COOKIEGEP]], i32 noundef %[[ADDCOOKIESIZE]])
+// CHECK-NEXT:   br label %delete.end
+// CHECK: vdtor.call:                                       ; preds = %delete.notnull
+// CHECK-NEXT:   %[[VTABLE:.*]] = load ptr, ptr %[[DEL_PTR]]
+// CHECK-NEXT:   %[[FPGEP:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i64 0
+// CHECK-NEXT:   %[[FPLOAD:.*]]  = load ptr, ptr %[[FPGEP]]
+// X64-NEXT:   %[[CALL:.*]] = call noundef ptr %[[FPLOAD]](ptr noundef nonnull align 8 dereferenceable(8) %[[DEL_PTR]], i32 noundef 3)
+// X86-NEXT:   %[[CALL:.*]] = call x86_thiscallcc noundef ptr %[[FPLOAD]](ptr noundef nonnull align 4 dereferenceable(4) %[[DEL_PTR]], i32 noundef 3)
+// CHECK-NEXT:   br label %delete.end
+// CHECK: delete.end:
+// CHECK-NEXT:   ret void
+
+// Vector dtor definition for Parrot.
+// X64-LABEL: define weak dso_local noundef ptr @"??_EParrot@@UEAAPEAXI@Z"(
+// X64-SAME: ptr {{.*}} %[[THIS:.*]], i32 {{.*}} %[[IMPLICIT_PARAM:.*]]) unnamed_addr
+// X86-LABEL: define weak dso_local x86_thiscallcc noundef ptr @"??_EParrot@@UAEPAXI@Z"(
+// X86-SAME: ptr noundef nonnull align 4 dereferenceable(4) %[[THIS:.*]], i32 noundef %[[IMPLICIT_PARAM:.*]]) unnamed_addr
+// CHECK: entry:
+// CHECK-NEXT:   %[[RET:.*]] = alloca ptr
+// CHECK-NEXT:   %[[IPADDR:.*]] = alloca i32
+// CHECK-NEXT:   %[[THISADDR:.*]] = alloca ptr
+// CHECK-NEXT:   store i32 %[[IMPLICIT_PARAM]], ptr %[[IPADDR]]
+// CHECK-NEXT:   store ptr %[[THIS]], ptr %[[THISADDR]]
+// CHECK-NEXT:   %[[LTHIS:.*]] = load ptr, ptr %[[THISADDR]]
+// CHECK-NEXT:   store ptr %[[LTHIS]], ptr %[[RET]]
+// CHECK-NEXT:   %[[LIP:.*]] = load i32, ptr %[[IPADDR]]
+// CHECK-NEXT:   %[[SECONDBIT:.*]] = and i32 %[[LIP]], 2
+// CHECK-NEXT:   %[[ISSECONDBITZERO:.*]] = icmp eq i32 %[[SECONDBIT]], 0
+// CHECK-NEXT:   br i1 %[[ISSECONDBITZERO]], label %dtor.scalar, label %dtor.vector
+// CHECK: dtor.vector:
+// X64-NEXT:   %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[LTHIS]], i64 -8
+// X86-NEXT:   %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[LTHIS]], i32 -4
+// X64-NEXT:   %[[HOWMANY:.*]] = load i64, ptr %[[COOKIEGEP]]
+// X86-NEXT:   %[[HOWMANY:.*]] = load i32, ptr %[[COOKIEGEP]]
+// X64-NEXT:   %[[END:.*]] = getelementptr inbounds %struct.Parrot, ptr %[[LTHIS]], i64 %[[HOWMANY]]
+// X86-NEXT:   %[[END:.*]] = getelementptr inbounds %struct.Parrot, ptr %[[LTHIS]], i32 %[[HOWMANY]]
+// CHECK-NEXT:   br label %arraydestroy.body
+// CHECK: arraydestroy.body:
+// CHECK-NEXT:   %[[PASTELEM:.*]] = phi ptr [ %[[END]], %dtor.vector ], [ %arraydestroy.element, %arraydestroy.body ]
+// X64-NEXT:   %[[CURELEM:.*]] = getelementptr inbounds %struct.Parrot, ptr %[[PASTELEM]], i64 -1
+// X86-NEXT:   %[[CURELEM:.*]] = getelementptr inbounds %struct.Parrot, ptr %[[PASTELEM]], i32 -1
+// X64-NEXT:   call void @"??1Parrot@@UEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(8) %[[CURELEM]])
+// X86-NEXT:   call x86_thiscallcc void @"??1Parrot@@UAE@XZ"(ptr noundef nonnull align 4 dereferenceable(4) %[[CURELEM]])
+// CHECK-NEXT:   %[[DONE:.*]] = icmp eq ptr %[[CURELEM]], %[[LTHIS]]
+// CHECK-NEXT:   br i1 %[[DONE]], label %arraydestroy.done3, label %arraydestroy.body
+// CHECK: arraydestroy.done3:
+// CHECK-NEXT:   br label %dtor.vector.cont
+// CHECK: dtor.vector.cont:
+// CHECK-NEXT:   %[[FIRSTBIT:.*]] = and i32 %[[LIP]], 1
+// CHECK-NEXT:   %[[ISFIRSTBITZERO:.*]] = icmp eq i32 %[[FIRSTBIT]], 0
+// CHECK-NEXT:   br i1 %[[ISFIRSTBITZERO]], label %dtor.continue, label %dtor.call_delete_after_array_destroy
+// CHECK: dtor.call_delete_after_array_destroy:
+// X64-NEXT:     call void @"??_V@YAXPEAX_K@Z"(ptr noundef %[[COOKIEGEP]], i64 noundef 8)
+// X86-NEXT:     call void @"??_V@YAXPAXI@Z"(ptr noundef %[[COOKIEGEP]], i32 noundef 4)
+// CHECK-NEXT:   br label %dtor.continue
+// CHECK: dtor.scalar:
+// X64-NEXT:   call void @"??1Parrot@@UEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(8) %[[LTHIS]])
+// X86-NEXT:   call x86_thiscallcc void @"??1Parrot@@UAE@XZ"(ptr noundef nonnull align 4 dereferenceable(4) %[[LTHIS]])
+// CHECK-NEXT:   %[[FIRSTBIT:.*]] = and i32 %[[LIP]], 1
+// CHECK-NEXT:   %[[ISFIRSTBITZERO:.*]] = icmp eq i32 %[[FIRSTBIT]], 0
+// CHECK-NEXT:   br i1 %[[ISFIRSTBITZERO]], label %dtor.continue, label %dtor.call_delete
+// CHECK: dtor.call_delete:
+// X64-NEXT:     call void @"??3@YAXPEAX_K@Z"(ptr noundef %[[LTHIS]], i64 noundef 8)
+// X86-NEXT:     call void @"??3@YAXPAXI@Z"(ptr noundef %[[LTHIS]], i32 noundef 4)
+// CHECK-NEXT:   br label %dtor.continue
+// CHECK: dtor.continue:
+// CHECK-NEXT:   %[[LOADRET:.*]] = load ptr, ptr %[[RET]]
+// CHECK-NEXT:   ret ptr %[[LOADRET]]
+
+// X64: define weak dso_local noundef ptr @"??_EJustAWeirdBird@@UEAAPEAXI@Z"(
+// X64-SAME: ptr noundef nonnull align 8 dereferenceable(8) %this, i32 noundef %should_call_delete)
+// CLANG21: define linkonce_odr dso_local noundef ptr @"??_GJustAWeirdBird@@UEAAPEAXI@Z"(
+// X86: define weak dso_local x86_thiscallcc noundef ptr @"??_EJustAWeirdBird@@UAEPAXI@Z"(
+// X86-SAME: ptr noundef nonnull align 4 dereferenceable(4) %this, i32 noundef %should_call_delete) unnamed_addr
+
+// X64-LABEL: define weak dso_local noundef ptr @"??_EHasOperatorDelete@@UEAAPEAXI@Z"
+// X86-LABEL: define weak dso_local x86_thiscallcc noundef ptr @"??_EHasOperatorDelete@@UAEPAXI@Z"
+// CLANG21: define linkonce_odr dso_local noundef ptr @"??_GHasOperatorDelete@@UEAAPEAXI@Z"
+// CHECK: dtor.call_delete_after_array_destroy:
+// CHECK-NEXT: %[[SHOULD_CALL_GLOB_DELETE:.*]] = and i32 %should_call_delete2, 4
+// CHECK-NEXT: %[[CHK:.*]] = icmp eq i32 %[[SHOULD_CALL_GLOB_DELETE]], 0
+// CHECK-NEXT: br i1 %[[CHK]], label %dtor.call_class_delete_after_array_destroy, label %dtor.call_glob_delete_after_array_destroy
+// CHECK: dtor.call_class_delete_after_array_destroy:
+// X64-NEXT:   call void @"??_VHasOperatorDelete@@SAXPEAX@Z"(ptr noundef %2)
+// X86-NEXT: call void @"??_VHasOperatorDelete@@SAXPAX@Z"
+// CHECK-NEXT:   br label %dtor.continue
+// CHECK: dtor.call_glob_delete_after_array_destroy:
+// X64-NEXT:   call void @"??_V@YAXPEAX_K@Z"(ptr noundef %2, i64 noundef 8)
+// X86-NEXT:   call void @"??_V@YAXPAXI@Z"(ptr noundef %2, i32 noundef 4)
+// CHECK-NEXT:   br label %dtor.continue
+
+
+
+struct BaseDelete1 {
+  void operator delete[](void *);
+};
+struct BaseDelete2 {
+  void operator delete[](void *);
+};
+struct BaseDestructor {
+  BaseDestructor() {}
+  virtual ~BaseDestructor() = default;
+};
+
+struct Derived : BaseDelete1, BaseDelete2, BaseDestructor {
+  Derived() {}
+};
+
+void foobartest() {
+    Derived *a = new Derived[10]();  // value-initialized array of 10 Derived objects
+    ::delete[] a;  // globally qualified array delete
+}
+
+// X64-LABEL: define weak dso_local noundef ptr @"??_EDerived@@UEAAPEAXI@Z"(ptr {{.*}} %this, i32 noundef %should_call_delete)
+// X86-LABEL: define weak dso_local x86_thiscallcc noundef ptr @"??_EDerived@@UAEPAXI@Z"(ptr {{.*}} %this, i32 noundef %should_call_delete)
+// CHECK:  %retval = alloca ptr
+// CHECK-NEXT:  %should_call_delete.addr = alloca i32, align 4
+// CHECK-NEXT:  %this.addr = alloca ptr
+// CHECK-NEXT:  store i32 %should_call_delete, ptr %should_call_delete.addr, align 4
+// CHECK-NEXT:  store ptr %this, ptr %this.addr
+// CHECK-NEXT:  %this1 = load ptr, ptr %this.addr
+// CHECK-NEXT:  store ptr %this1, ptr %retval
+// CHECK-NEXT:  %should_call_delete2 = load i32, ptr %should_call_delete.addr, align 4
+// CHECK-NEXT:  %0 = and i32 %should_call_delete2, 2
+// CHECK-NEXT:  %1 = icmp eq i32 %0, 0
+// CHECK-NEXT:  br i1 %1, label %dtor.scalar, label %dtor.vector
+// CHECK: dtor.vector:
+// X64-NEXT:  %2 = getelementptr inbounds i8, ptr %this1, i64 -8
+// X86-NEXT:  %2 = getelementptr inbounds i8, ptr %this1, i32 -4
+// X64-NEXT:  %3 = load i64, ptr %2
+// X86-NEXT:  %3 = load i32, ptr %2
+// X64-NEXT:  %delete.end = getelementptr inbounds %struct.Derived, ptr %this1, i64 %3
+// X86-NEXT:  %delete.end = getelementptr inbounds %struct.Derived, ptr %this1, i32 %3
+// CHECK-NEXT:  br label %arraydestroy.body
+// CHECK: arraydestroy.body:
+// CHECK-NEXT:  %arraydestroy.elementPast = phi ptr [ %delete.end, %dtor.vector ], [ %arraydestroy.element, %arraydestroy.body ]
+// X64-NEXT:  %arraydestroy.element = getelementptr inbounds %struct.Derived, ptr %arraydestroy.elementPast, i64 -1
+// X86-NEXT:  %arraydestroy.element = getelementptr inbounds %struct.Derived, ptr %arraydestroy.elementPast, i32 -1
+// X64-NEXT:  call void @"??1Derived@@UEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(16) %arraydestroy.element)
+// X86-NEXT:  call x86_thiscallcc void @"??1Derived@@UAE@XZ"(ptr noundef nonnull align 4 dereferenceable(8) %arraydestroy.element)
+// CHECK-NEXT:  %arraydestroy.done = icmp eq ptr %arraydestroy.element, %this1
+// CHECK-NEXT:  br i1 %arraydestroy.done, label %arraydestroy.done3, label %arraydestroy.body
+// CHECK: arraydestroy.done3:
+// CHECK-NEXT:  br label %dtor.vector.cont
+// CHECK: dtor.vector.cont:
+// CHECK-NEXT:  %4 = and i32 %should_call_delete2, 1
+// CHECK-NEXT:  %5 = icmp eq i32 %4, 0
+// CHECK-NEXT:  br i1 %5, label %dtor.continue, label %dtor.call_delete_after_array_destroy
+// CHECK: dtor.call_delete_after_array_destroy:
+// X64-NEXT:  call void @"??_V@YAXPEAX_K@Z"(ptr noundef %2, i64 noundef 16)
+// X86-NEXT:  call void @"??_V@YAXPAXI@Z"(ptr noundef %2, i32 noundef 8)
+// CHECK-NEXT:  br label %dtor.continue
+// CHECK: dtor.scalar:
+// X64-NEXT:  call void @"??1Derived@@UEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(16) %this1)
+// X86-NEXT:  call x86_thiscallcc void @"??1Derived@@UAE@XZ"(ptr noundef nonnull align 4 dereferenceable(8) %this1)
+// CHECK-NEXT:  %6 = and i32 %should_call_delete2, 1
+// CHECK-NEXT:  %7 = icmp eq i32 %6, 0
+// CHECK-NEXT:  br i1 %7, label %dtor.continue, label %dtor.call_delete
+// CHECK: dtor.call_delete:
+// X64-NEXT:  call void @"??3@YAXPEAX_K@Z"(ptr noundef %this1, i64 noundef 16)
+// X86-NEXT:  call void @"??3@YAXPAXI@Z"(ptr noundef %this1, i32 noundef 8)
+// CHECK-NEXT:  br label %dtor.continue
+// CHECK: dtor.continue:
+// CHECK-NEXT:  %8 = load ptr, ptr %retval
+// CHECK-NEXT:  ret ptr %8
+
+// X64: define weak dso_local noundef ptr @"??_EAllocatedAsArray@@UEAAPEAXI@Z"
+// X86: define weak dso_local x86_thiscallcc noundef ptr @"??_EAllocatedAsArray@@UAEPAXI@Z"
+// CLANG21: define linkonce_odr dso_local noundef ptr @"??_GAllocatedAsArray@@UEAAPEAXI@Z"
diff --git a/clang/test/CodeGenCXX/vtable-consteval.cpp b/clang/test/CodeGenCXX/vtable-consteval.cpp
index 1454f6fde357d..220143465c574 100644
--- a/clang/test/CodeGenCXX/vtable-consteval.cpp
+++ b/clang/test/CodeGenCXX/vtable-consteval.cpp
@@ -26,7 +26,7 @@ struct B {
 B b;
 
 // ITANIUM-DAG: @_ZTV1C = {{.*}} constant { [4 x ptr] } {{.*}} null, ptr @_ZTI1C, ptr @_ZN1CD1Ev, ptr @_ZN1CD0Ev
-// MSABI-DAG: @[[C_VFTABLE:.*]] = {{.*}} constant { [2 x ptr] } {{.*}} @"??_R4C@@6B@", ptr @"??_GC@@UEAAPEAXI@Z"
+// MSABI-DAG: @[[C_VFTABLE:.*]] = {{.*}} constant { [2 x ptr] } {{.*}} @"??_R4C@@6B@", ptr @"??_EC@@UEAAPEAXI@Z"
 struct C {
   virtual ~C() = default;
   virtual consteval C &operator=(const C&) = default;
@@ -36,7 +36,7 @@ struct C {
 C c;
 
 // ITANIUM-DAG: @_ZTV1D = {{.*}} constant { [4 x ptr] } {{.*}} null, ptr @_ZTI1D, ptr @_ZN1DD1Ev, ptr @_ZN1DD0Ev
-// MSABI-DAG: @[[D_VFTABLE:.*]] = {{.*}} constant { [2 x ptr] } {{.*}} @"??_R4D@@6B@", ptr @"??_GD@@UEAAPEAXI@Z"
+// MSABI-DAG: @[[D_VFTABLE:.*]] = {{.*}} constant { [2 x ptr] } {{.*}} @"??_R4D@@6B@", ptr @"??_ED@@UEAAPEAXI@Z"
 struct D : C {};
 // ITANIUM-DAG: @d = {{.*}}global { ptr } { {{.*}} @_ZTV1D,
 // MSABI-DAG: @"?d@@3UD@@A" = {{.*}}global { ptr } { ptr @"??_7D@@6B@" }
diff --git a/clang/test/DebugInfo/CXX/windows-dtor.cpp b/clang/test/DebugInfo/CXX/windows-dtor.cpp
index beea56ce7368b..ffef45b9f7d1b 100644
--- a/clang/test/DebugInfo/CXX/windows-dtor.cpp
+++ b/clang/test/DebugInfo/CXX/windows-dtor.cpp
@@ -16,7 +16,7 @@ struct AB: A, B {
 template struct AB<int>;
 
 // CHECK: define {{.*}}@"??_E?$AB@H@@W3AEPAXI@Z"({{.*}} !dbg [[THUNK_VEC_DEL_DTOR:![0-9]*]]
-// CHECK: call {{.*}}@"??_G?$AB@H@@UAEPAXI@Z"({{.*}}) #{{[0-9]*}}, !dbg [[THUNK_LOC:![0-9]*]]
+// CHECK: call {{.*}}@"??_E?$AB@H@@UAEPAXI@Z"({{.*}}) #{{[0-9]*}}, !dbg [[THUNK_LOC:![0-9]*]]
 // CHECK: define
 
 // CHECK: [[THUNK_VEC_DEL_DTOR]] = distinct !DISubprogram
diff --git a/clang/test/Modules/Inputs/msvc-vector-deleting-dtors/module.modulemap b/clang/test/Modules/Inputs/msvc-vector-deleting-dtors/module.modulemap
new file mode 100644
index 0000000000000..bb7ff1c9952c8
--- /dev/null
+++ b/clang/test/Modules/Inputs/msvc-vector-deleting-dtors/module.modulemap
@@ -0,0 +1 @@
+module msvc_vector_deleting_destructors { header "msvc-vector-deleting-dtors.h" export * }
diff --git a/clang/test/Modules/Inputs/msvc-vector-deleting-dtors/msvc-vector-deleting-dtors.h b/clang/test/Modules/Inputs/msvc-vector-deleting-dtors/msvc-vector-deleting-dtors.h
new file mode 100644
index 0000000000000..55492667e39d7
--- /dev/null
+++ b/clang/test/Modules/Inputs/msvc-vector-deleting-dtors/msvc-vector-deleting-dtors.h
@@ -0,0 +1,16 @@
+class Base1 {
+public:
+  void operator delete[](void *);
+};
+class Base2 {
+public:
+  void operator delete(void *);
+};
+struct Derived : Base1, Base2 {
+  virtual ~Derived() {}
+};
+void in_h_tests(Derived *p, Derived *p1) {
+  ::delete[] p;  // globally qualified array delete
+
+  delete[] p1;  // unqualified array delete
+}
diff --git a/clang/test/Modules/msvc-vector-deleting-destructors.cpp b/clang/test/Modules/msvc-vector-deleting-destructors.cpp
new file mode 100644
index 0000000000000..a0806054355db
--- /dev/null
+++ b/clang/test/Modules/msvc-vector-deleting-destructors.cpp
@@ -0,0 +1,30 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps %s -x c++ -fmodules-cache-path=%t -I %S/Inputs/msvc-vector-deleting-dtors -emit-llvm -triple=i386-pc-win32 -o - | FileCheck %s --check-prefixes CHECK,CHECK32
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps %s -x c++ -fmodules-cache-path=%t -I %S/Inputs/msvc-vector-deleting-dtors -emit-llvm -triple=x86_64-pc-win32 -o - | FileCheck %s --check-prefixes CHECK,CHECK64
+
+#include "msvc-vector-deleting-dtors.h"
+
+void call_in_module_function(void) {
+    in_h_tests(new Derived[2], new Derived[3]);
+}
+
+void out_of_module_tests(Derived *p, Derived *p1) {
+  ::delete[] p;
+
+  delete[] p1;
+}
+
+// CHECK32-LABEL: define weak dso_local x86_thiscallcc noundef ptr @"??_EDerived@@UAEPAXI@Z"
+// CHECK64-LABEL: define weak dso_local noundef ptr @"??_EDerived@@UEAAPEAXI@Z"
+// CHECK: dtor.call_class_delete_after_array_destroy:
+// CHECK32-NEXT:  call void @"??_VBase1@@SAXPAX@Z"(ptr noundef %2)
+// CHECK64-NEXT:  call void @"??_VBase1@@SAXPEAX@Z"(ptr noundef %2)
+// CHECK: dtor.call_glob_delete_after_array_destroy:
+// CHECK32-NEXT:   call void @"??_V@YAXPAXI@Z"(ptr noundef %2, i32 noundef 8)
+// CHECK64-NEXT:   call void @"??_V@YAXPEAX_K@Z"(ptr noundef %2, i64 noundef 16)
+// CHECK: dtor.call_glob_delete:
+// CHECK32-NEXT:   call void @"??3@YAXPAXI@Z"(ptr noundef %this1, i32 noundef 8)
+// CHECK64-NEXT:   call void @"??3@YAXPEAX_K@Z"(ptr noundef %this1, i64 noundef 16)
+// CHECK: dtor.call_class_delete:
+// CHECK32-NEXT:   call void @"??3Base2@@SAXPAX@Z"(ptr noundef %this1)
+// CHECK64-NEXT:   call void @"??3Base2@@SAXPEAX@Z"(ptr noundef %this1)
diff --git a/clang/test/Modules/vtable-windows.cppm b/clang/test/Modules/vtable-windows.cppm
index dbde24c8a9bdd..e45e32d6b4d60 100644
--- a/clang/test/Modules/vtable-windows.cppm
+++ b/clang/test/Modules/vtable-windows.cppm
@@ -23,4 +23,4 @@ void test() {
 
 // Check that the virtual table is an unnamed_addr constant in comdat that can
 // be merged with the virtual table with other TUs.
-// CHECK: unnamed_addr constant {{.*}}[ptr @"??_R4Fruit@@6B@", ptr @"??_GFruit@@UAEPAXI@Z", ptr @"?eval@Fruit@@UAEXXZ"{{.*}}comdat($"??_7Fruit@@6B@")
+// CHECK: unnamed_addr constant {{.*}}[ptr @"??_R4Fruit@@6B@", ptr @"??_EFruit@@UAEPAXI@Z", ptr @"?eval@Fruit@@UAEXXZ"{{.*}}comdat($"??_7Fruit@@6B@")
diff --git a/clang/test/PCH/Inputs/msvc-vector-deleting-dtors.h b/clang/test/PCH/Inputs/msvc-vector-deleting-dtors.h
new file mode 100644
index 0000000000000..55492667e39d7
--- /dev/null
+++ b/clang/test/PCH/Inputs/msvc-vector-deleting-dtors.h
@@ -0,0 +1,16 @@
+class Base1 {
+public:
+  void operator delete[](void *);
+};
+class Base2 {
+public:
+  void operator delete(void *);
+};
+struct Derived : Base1, Base2 {
+  virtual ~Derived() {}
+};
+void in_h_tests(Derived *p, Derived *p1) {
+  ::delete[] p;
+
+  delete[] p1;
+}
diff --git a/clang/test/PCH/msvc-vector-deleting-destructors.cpp b/clang/test/PCH/msvc-vector-deleting-destructors.cpp
new file mode 100644
index 0000000000000..f548dba8efd20
--- /dev/null
+++ b/clang/test/PCH/msvc-vector-deleting-destructors.cpp
@@ -0,0 +1,34 @@
+// Test this without pch.
+// RUN: %clang_cc1 -x c++ -include %S/Inputs/msvc-vector-deleting-dtors.h -emit-llvm -o - %s -triple=i386-pc-win32 | FileCheck %s --check-prefixes CHECK,CHECK32
+// RUN: %clang_cc1 -x c++ -include %S/Inputs/msvc-vector-deleting-dtors.h -emit-llvm -o - %s -triple=x86_64-pc-win32 | FileCheck %s --check-prefixes CHECK,CHECK64
+
+// Test with pch.
+// RUN: %clang_cc1 -x c++ -emit-pch -o %t -triple=i386-pc-win32 %S/Inputs/msvc-vector-deleting-dtors.h
+// RUN: %clang_cc1 -x c++ -include-pch %t -emit-llvm -triple=i386-pc-win32 -o - %s | FileCheck %s --check-prefixes CHECK,CHECK32
+// RUN: %clang_cc1 -x c++ -emit-pch -o %t -triple=x86_64-pc-win32 %S/Inputs/msvc-vector-deleting-dtors.h
+// RUN: %clang_cc1 -x c++ -include-pch %t -emit-llvm -triple=x86_64-pc-win32 -o - %s | FileCheck %s --check-prefixes CHECK,CHECK64
+
+void call_in_module_function(void) {
+    in_h_tests(new Derived[2], new Derived[3]);
+}
+
+void out_of_module_tests(Derived *p, Derived *p1) {
+  ::delete[] p;
+
+  delete[] p1;
+}
+
+// CHECK32-LABEL: define weak dso_local x86_thiscallcc noundef ptr @"??_EDerived@@UAEPAXI@Z"
+// CHECK64-LABEL: define weak dso_local noundef ptr @"??_EDerived@@UEAAPEAXI@Z"
+// CHECK: dtor.call_class_delete_after_array_destroy:
+// CHECK32-NEXT:  call void @"??_VBase1@@SAXPAX@Z"(ptr noundef %2)
+// CHECK64-NEXT:  call void @"??_VBase1@@SAXPEAX@Z"(ptr noundef %2)
+// CHECK: dtor.call_glob_delete_after_array_destroy:
+// CHECK32-NEXT:   call void @"??_V@YAXPAXI@Z"(ptr noundef %2, i32 noundef 8)
+// CHECK64-NEXT:   call void @"??_V@YAXPEAX_K@Z"(ptr noundef %2, i64 noundef 16)
+// CHECK: dtor.call_glob_delete:
+// CHECK32-NEXT:   call void @"??3@YAXPAXI@Z"(ptr noundef %this1, i32 noundef 8)
+// CHECK64-NEXT:   call void @"??3@YAXPEAX_K@Z"(ptr noundef %this1, i64 noundef 16)
+// CHECK: dtor.call_class_delete:
+// CHECK32-NEXT:   call void @"??3Base2@@SAXPAX@Z"(ptr noundef %this1)
+// CHECK64-NEXT:   call void @"??3Base2@@SAXPEAX@Z"(ptr noundef %this1)
diff --git a/clang/test/Profile/cxx-abc-deleting-dtor.cpp b/clang/test/Profile/cxx-abc-deleting-dtor.cpp
index c65a8e8013c35..7c2a5bbc93af3 100644
--- a/clang/test/Profile/cxx-abc-deleting-dtor.cpp
+++ b/clang/test/Profile/cxx-abc-deleting-dtor.cpp
@@ -24,16 +24,15 @@ DerivedABC *useABCVTable() { return new DerivedABC(); }
 // MSVC: @"__profn_??1ABC@@{{.*}}" =
 // MSVC-NOT: @"__profn_??_G{{.*}}" =
 
-// MSVC-LABEL: define linkonce_odr dso_local noundef ptr @"??_GDerivedABC@@UEAAPEAXI@Z"(ptr {{[^,]*}} %this, {{.*}})
-// MSVC-NOT:   call void @llvm.instrprof.increment({{.*}})
-// MSVC:   call void @"??1DerivedABC@@UEAA@XZ"({{.*}})
-// MSVC:   ret void
-
 // MSVC-LABEL: define linkonce_odr dso_local noundef ptr @"??_GABC@@UEAAPEAXI@Z"(ptr {{[^,]*}} %this, {{.*}})
 // MSVC-NOT:   call void @llvm.instrprof.increment({{.*}})
 // MSVC:   call void @llvm.trap()
 // MSVC-NEXT:   unreachable
 
+// MSVC-LABEL: define linkonce_odr dso_local noundef ptr @"??_GDerivedABC@@UEAAPEAXI@Z"(ptr {{[^,]*}} %this, {{.*}})
+// MSVC-NOT:   call void @llvm.instrprof.increment({{.*}})
+// MSVC:   call void @"??1DerivedABC@@UEAA@XZ"({{.*}})
+
 // MSVC-LABEL: define linkonce_odr dso_local void @"??1DerivedABC@@UEAA@XZ"({{.*}})
 // MSVC:   call void @llvm.instrprof.increment({{.*}})
 // MSVC:   call void @"??1ABC@@UEAA@XZ"({{.*}})
diff --git a/clang/test/SemaCXX/gh134265.cpp b/clang/test/SemaCXX/gh134265.cpp
new file mode 100644
index 0000000000000..790165411c938
--- /dev/null
+++ b/clang/test/SemaCXX/gh134265.cpp
@@ -0,0 +1,62 @@
+// RUN: %clang_cc1 %s -verify=expected -fsyntax-only -triple=x86_64-unknown-linux-gnu
+// RUN: %clang_cc1 %s -verify=expected -fsyntax-only -triple=x86_64-unknown-linux-gnu -std=c++20
+// RUN: %clang_cc1 %s -verify=expected,ms -fms-extensions -fms-compatibility -triple=x86_64-pc-windows-msvc -DMS
+
+// Verify that clang doesn't emit additional errors when searching for
+// additional operators delete for vector deleting destructors support.
+
+struct Foo {
+  virtual ~Foo() {} // expected-error {{attempt to use a deleted function}}
+  static void operator delete(void* ptr) = delete; // expected-note {{explicitly marked deleted here}}
+};
+
+
+struct Bar {
+  virtual ~Bar() {}
+  static void operator delete[](void* ptr) = delete;
+};
+
+struct Baz {
+  virtual ~Baz() {}
+  static void operator delete[](void* ptr) = delete; // expected-note {{explicitly marked deleted here}}
+};
+
+struct BarBaz {
+  ~BarBaz() {}
+  static void operator delete[](void* ptr) = delete;
+};
+
+void foobar() {
+  Baz *B = new Baz[10]();
+  delete [] B; // expected-error {{attempt to use a deleted function}}
+  BarBaz *BB = new BarBaz[10]();
+}
+
+struct BaseDelete1 {
+  void operator delete[](void *);
+};
+struct BaseDelete2 {
+  void operator delete[](void *);
+};
+struct BaseDestructor {
+  BaseDestructor() {}
+  virtual ~BaseDestructor() = default;
+};
+struct Final : BaseDelete1, BaseDelete2, BaseDestructor {
+  Final() {}
+};
+struct FinalExplicit : BaseDelete1, BaseDelete2, BaseDestructor {
+  FinalExplicit() {}
+  inline ~FinalExplicit() {}
+};
+
+#ifdef MS
+struct Final1 : BaseDelete1, BaseDelete2, BaseDestructor {
+  __declspec(dllexport) ~Final1() {}
+};
+#endif // MS
+
+void foo() {
+    Final* a = new Final[10]();
+    FinalExplicit* b = new FinalExplicit[10]();
+}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index f00e94aee9847..bcb3ad854c388 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -2516,6 +2516,7 @@ static llvm::StringRef ClangToItaniumDtorKind(clang::CXXDtorType kind) {
   case clang::CXXDtorType::Dtor_Unified:
     return "D4";
   case clang::CXXDtorType::Dtor_Comdat:
+  case clang::CXXDtorType::Dtor_VectorDeleting:
     llvm_unreachable("Unexpected destructor kind.");
   }
   llvm_unreachable("Fully covered switch above");

From 7a53d33e7c29ce3058eb12d2f85ca0a46b0ea363 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Thu, 13 Nov 2025 18:35:30 +0900
Subject: [PATCH 19/29] [mlir] Add FP software implementation lowering pass:
 `arith-to-apfloat` (#167848)

Reland pass and fix linker errors.

---------

Co-authored-by: Maksim Levental <maksim.levental@gmail.com>
---
 .../ArithToAPFloat/ArithToAPFloat.h           |  21 +++
 mlir/include/mlir/Conversion/Passes.h         |   1 +
 mlir/include/mlir/Conversion/Passes.td        |  15 ++
 mlir/include/mlir/Dialect/Func/Utils/Utils.h  |   7 +
 .../mlir/Dialect/LLVMIR/FunctionCallUtils.h   |   4 +
 .../ArithToAPFloat/ArithToAPFloat.cpp         | 163 ++++++++++++++++++
 .../Conversion/ArithToAPFloat/CMakeLists.txt  |  18 ++
 .../Conversion/ArithToLLVM/ArithToLLVM.cpp    |   1 +
 mlir/lib/Conversion/CMakeLists.txt            |   1 +
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      |  14 ++
 mlir/lib/Dialect/Func/Utils/Utils.cpp         |  25 +++
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   |  11 ++
 mlir/lib/ExecutionEngine/APFloatWrappers.cpp  |  89 ++++++++++
 mlir/lib/ExecutionEngine/CMakeLists.txt       |  21 +++
 .../ArithToApfloat/arith-to-apfloat.mlir      | 128 ++++++++++++++
 .../Arith/CPU/test-apfloat-emulation.mlir     |  40 +++++
 mlir/test/lit.cfg.py                          |   5 +
 17 files changed, 564 insertions(+)
 create mode 100644 mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
 create mode 100644 mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
 create mode 100644 mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
 create mode 100644 mlir/lib/ExecutionEngine/APFloatWrappers.cpp
 create mode 100644 mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
 create mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir

diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
new file mode 100644
index 0000000000000..64a42a228199e
--- /dev/null
+++ b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
@@ -0,0 +1,21 @@
+//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions. See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
+#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+
+#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 40d866ec7bf10..82bdfd02661a6 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -12,6 +12,7 @@
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
+#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
 #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h"
 #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 70e3e45c225db..79bc380dbcb7a 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -186,6 +186,21 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// ArithToAPFloat
+//===----------------------------------------------------------------------===//
+
+def ArithToAPFloatConversionPass
+    : Pass<"convert-arith-to-apfloat", "ModuleOp"> {
+  let summary = "Convert Arith ops to APFloat runtime library calls";
+  let description = [{
+    This pass converts supported Arith ops to APFloat-based runtime library
+    calls (APFloatWrappers.cpp). APFloat is a software implementation of
+    floating-point arithmetic operations.
+  }];
+  let dependentDialects = ["func::FuncDialect"];
+}
+
 //===----------------------------------------------------------------------===//
 // ArithToSPIRV
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
index 3576126a487ac..00d50874a2e8d 100644
--- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
@@ -60,6 +60,13 @@ mlir::FailureOr<std::pair<mlir::func::FuncOp, mlir::func::CallOp>>
 deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp,
                         mlir::ModuleOp moduleOp);
 
+/// Look up a FuncOp named `name` with function type `funcT`. Return a null
+/// FuncOp if no FuncOp with that name exists, and a failure if a FuncOp is
+/// found but with a different signature.
+FailureOr<FuncOp> lookupFnDecl(SymbolOpInterface symTable, StringRef name,
+                               FunctionType funcT,
+                               SymbolTableCollection *symbolTables = nullptr);
+
 } // namespace func
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index 8ad9ed18acebd..b09d32022e348 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -52,6 +52,10 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
                          SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
+                             SymbolTableCollection *symbolTables = nullptr);
+
 /// Declares a function to print a C-string.
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
new file mode 100644
index 0000000000000..699edb188a70a
--- /dev/null
+++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
@@ -0,0 +1,163 @@
+//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Utils/Utils.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Verifier.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::func;
+
+static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable,
+                           StringRef name, FunctionType funcT, bool setPrivate,
+                           SymbolTableCollection *symbolTables = nullptr) {
+  OpBuilder::InsertionGuard g(b);
+  assert(!symTable->getRegion(0).empty() && "expected non-empty region");
+  b.setInsertionPointToStart(&symTable->getRegion(0).front());
+  FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT);
+  if (setPrivate)
+    funcOp.setPrivate();
+  if (symbolTables) {
+    SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable);
+    symbolTable.insert(funcOp, symTable->getRegion(0).front().begin());
+  }
+  return funcOp;
+}
+
+/// Helper function to look up or create the symbol for a runtime library
+/// function for a binary arithmetic operation.
+///
+/// Parameter 1: APFloat semantics
+/// Parameter 2: Left-hand side operand
+/// Parameter 3: Right-hand side operand
+///
+/// This function will return a failure if the function is found but has an
+/// unexpected signature.
+///
+static FailureOr<FuncOp>
+lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
+                       SymbolTableCollection *symbolTables = nullptr) {
+  auto i32Type = IntegerType::get(symTable->getContext(), 32);
+  auto i64Type = IntegerType::get(symTable->getContext(), 64);
+
+  std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str();
+  FunctionType funcT =
+      FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type});
+  FailureOr<FuncOp> func =
+      lookupFnDecl(symTable, funcName, funcT, symbolTables);
+  // Failed due to type mismatch.
+  if (failed(func))
+    return func;
+  // Successfully matched existing decl.
+  if (*func)
+    return *func;
+
+  return createFnDecl(b, symTable, funcName, funcT,
+                      /*setPrivate=*/true, symbolTables);
+}
+
+/// Rewrite a binary arithmetic operation to an APFloat function call.
+template <typename OpTy>
+struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
+  BinaryArithOpToAPFloatConversion(MLIRContext *context,
+                                   const char *APFloatName,
+                                   SymbolOpInterface symTable,
+                                   PatternBenefit benefit = 1)
+      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable),
+        APFloatName(APFloatName) {};
+
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const override {
+    // Get APFloat function from runtime library.
+    FailureOr<FuncOp> fn =
+        lookupOrCreateBinaryFn(rewriter, symTable, APFloatName);
+    if (failed(fn))
+      return fn;
+
+    rewriter.setInsertionPoint(op);
+    // Cast operands to 64-bit integers.
+    Location loc = op.getLoc();
+    auto floatTy = cast<FloatType>(op.getType());
+    auto intWType = rewriter.getIntegerType(floatTy.getWidth());
+    auto int64Type = rewriter.getI64Type();
+    Value lhsBits = arith::ExtUIOp::create(
+        rewriter, loc, int64Type,
+        arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs()));
+    Value rhsBits = arith::ExtUIOp::create(
+        rewriter, loc, int64Type,
+        arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs()));
+
+    // Call APFloat function.
+    int32_t sem =
+        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+    Value semValue = arith::ConstantOp::create(
+        rewriter, loc, rewriter.getI32Type(),
+        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+    SmallVector<Value> params = {semValue, lhsBits, rhsBits};
+    auto resultOp =
+        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
+                             SymbolRefAttr::get(*fn), params);
+
+    // Truncate result to the original width.
+    Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType,
+                                                  resultOp->getResult(0));
+    rewriter.replaceOp(
+        op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits));
+    return success();
+  }
+
+  SymbolOpInterface symTable;
+  const char *APFloatName;
+};
+
+namespace {
+struct ArithToAPFloatConversionPass final
+    : impl::ArithToAPFloatConversionPassBase<ArithToAPFloatConversionPass> {
+  using Base::Base;
+
+  void runOnOperation() override;
+};
+
+void ArithToAPFloatConversionPass::runOnOperation() {
+  MLIRContext *context = &getContext();
+  RewritePatternSet patterns(context);
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::AddFOp>>(context, "add",
+                                                                getOperation());
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::SubFOp>>(
+      context, "subtract", getOperation());
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::MulFOp>>(
+      context, "multiply", getOperation());
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::DivFOp>>(
+      context, "divide", getOperation());
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::RemFOp>>(
+      context, "remainder", getOperation());
+  LogicalResult result = success();
+  ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) {
+    if (diag.getSeverity() == DiagnosticSeverity::Error) {
+      result = failure();
+    }
+    // NB: if you don't return failure, no other diag handlers will fire (see
+    // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit).
+    return failure();
+  });
+  walkAndApplyPatterns(getOperation(), std::move(patterns));
+  if (failed(result))
+    return signalPassFailure();
+}
+} // namespace
diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
new file mode 100644
index 0000000000000..b5ec49c087163
--- /dev/null
+++ b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_mlir_conversion_library(MLIRArithToAPFloat
+  ArithToAPFloat.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToAPFloat
+
+  DEPENDS
+  MLIRConversionPassIncGen
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRArithTransforms
+  MLIRFuncDialect
+  MLIRFuncUtils
+  )
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index b6099902cc337..f2bacc3399144 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/TypeUtilities.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index bebf1b8fff3f9..613dc6d242ceb 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(AffineToStandard)
 add_subdirectory(AMDGPUToROCDL)
 add_subdirectory(ArithCommon)
 add_subdirectory(ArithToAMDGPU)
+add_subdirectory(ArithToAPFloat)
 add_subdirectory(ArithToArmSME)
 add_subdirectory(ArithToEmitC)
 add_subdirectory(ArithToLLVM)
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 69a317ecd101f..c747e1b59558a 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1654,6 +1654,20 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
           return failure();
         }
       }
+    } else if (auto floatTy = dyn_cast<FloatType>(printType)) {
+      // Print other floating-point types using the APFloat runtime library.
+      int32_t sem =
+          llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+      Value semValue = LLVM::ConstantOp::create(
+          rewriter, loc, rewriter.getI32Type(),
+          rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+      Value floatBits =
+          LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value);
+      printer =
+          LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables);
+      emitCall(rewriter, loc, printer.value(),
+               ValueRange({semValue, floatBits}));
+      return success();
     } else {
       return failure();
     }
diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp
index b4cb0932ef631..d6dfd0229963c 100644
--- a/mlir/lib/Dialect/Func/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp
@@ -254,3 +254,28 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp,
 
   return std::make_pair(*newFuncOpOrFailure, newCallOp);
 }
+
+FailureOr<func::FuncOp>
+func::lookupFnDecl(SymbolOpInterface symTable, StringRef name,
+                   FunctionType funcT, SymbolTableCollection *symbolTables) {
+  FuncOp func;
+  if (symbolTables) {
+    func = symbolTables->lookupSymbolIn<FuncOp>(
+        symTable, StringAttr::get(symTable->getContext(), name));
+  } else {
+    func = llvm::dyn_cast_or_null<FuncOp>(
+        SymbolTable::lookupSymbolIn(symTable, name));
+  }
+
+  if (!func)
+    return func;
+
+  mlir::FunctionType foundFuncT = func.getFunctionType();
+  // Check that the signature of the found function matches the expected one.
+  if (funcT != foundFuncT) {
+    return func.emitError("matched function '")
+           << name << "' but with different type: " << foundFuncT
+           << " (expected " << funcT << ")";
+  }
+  return func;
+}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index feaffa34897b6..160b6ae89215c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -30,6 +30,7 @@ static constexpr llvm::StringRef kPrintF16 = "printF16";
 static constexpr llvm::StringRef kPrintBF16 = "printBF16";
 static constexpr llvm::StringRef kPrintF32 = "printF32";
 static constexpr llvm::StringRef kPrintF64 = "printF64";
+static constexpr llvm::StringRef kPrintApFloat = "printApFloat";
 static constexpr llvm::StringRef kPrintString = "printString";
 static constexpr llvm::StringRef kPrintOpen = "printOpen";
 static constexpr llvm::StringRef kPrintClose = "printClose";
@@ -160,6 +161,16 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
       LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
+FailureOr<LLVM::LLVMFuncOp>
+mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
+                                         SymbolTableCollection *symbolTables) {
+  return lookupOrCreateReservedFn(
+      b, moduleOp, kPrintApFloat,
+      {IntegerType::get(moduleOp->getContext(), 32),
+       IntegerType::get(moduleOp->getContext(), 64)},
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
+}
+
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
   return LLVM::LLVMPointerType::get(context);
 }
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
new file mode 100644
index 0000000000000..0a05f7369e556
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
@@ -0,0 +1,89 @@
+//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes the APFloat infrastructure to MLIR programs as a runtime
+// library. APFloat is a software implementation of floating-point arithmetic.
+//
+// On the MLIR side, floating-point values must be bitcasted to 64-bit integers
+// before calling a runtime function. If a floating-point type has less than
+// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an
+// integer.
+//
+// Runtime functions receive the floating-point operands of the arithmetic
+// operation in the form of 64-bit integers, along with the APFloat semantics
+// in the form of a 32-bit integer, which will be interpreted as an
+// APFloatBase::Semantics enum value.
+//
+#include "llvm/ADT/APFloat.h"
+
+#ifdef _WIN32
+#ifndef MLIR_APFLOAT_WRAPPERS_EXPORT
+#ifdef mlir_apfloat_wrappers_EXPORTS
+// We are building this library
+#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllexport)
+#else
+// We are using this library
+#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllimport)
+#endif // mlir_apfloat_wrappers_EXPORTS
+#endif // MLIR_APFLOAT_WRAPPERS_EXPORT
+#else
+// Non-windows: use visibility attributes.
+#define MLIR_APFLOAT_WRAPPERS_EXPORT __attribute__((visibility("default")))
+#endif // _WIN32
+
+/// Binary operations without rounding mode.
+#define APFLOAT_BINARY_OP(OP)                                                  \
+  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                     \
+      int32_t semantics, uint64_t a, uint64_t b) {                             \
+    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
+        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
+    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
+    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
+    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
+    lhs.OP(rhs);                                                               \
+    return lhs.bitcastToAPInt().getZExtValue();                                \
+  }
+
+/// Binary operations with rounding mode.
+#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                     \
+  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                     \
+      int32_t semantics, uint64_t a, uint64_t b) {                             \
+    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
+        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
+    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
+    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
+    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
+    lhs.OP(rhs, ROUNDING_MODE);                                                \
+    return lhs.bitcastToAPInt().getZExtValue();                                \
+  }
+
+extern "C" {
+
+#define BIN_OPS_WITH_ROUNDING(X)                                               \
+  X(add, llvm::RoundingMode::NearestTiesToEven)                                \
+  X(subtract, llvm::RoundingMode::NearestTiesToEven)                           \
+  X(multiply, llvm::RoundingMode::NearestTiesToEven)                           \
+  X(divide, llvm::RoundingMode::NearestTiesToEven)
+
+BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE)
+#undef BIN_OPS_WITH_ROUNDING
+#undef APFLOAT_BINARY_OP_ROUNDING_MODE
+
+APFLOAT_BINARY_OP(remainder)
+
+#undef APFLOAT_BINARY_OP
+
+MLIR_APFLOAT_WRAPPERS_EXPORT void printApFloat(int32_t semantics, uint64_t a) {
+  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
+      static_cast<llvm::APFloatBase::Semantics>(semantics));
+  unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
+  llvm::APFloat x(sem, llvm::APInt(bitWidth, a));
+  double d = x.convertToDouble();
+  fprintf(stdout, "%lg", d);
+}
+}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index fdeb4dacf9278..c813a431270d0 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -2,6 +2,7 @@
 # is a big dependency which most don't need.
 
 set(LLVM_OPTIONAL_SOURCES
+  APFloatWrappers.cpp
   ArmRunnerUtils.cpp
   ArmSMEStubs.cpp
   AsyncRuntime.cpp
@@ -167,6 +168,26 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
+  if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    # TODO: This support library is only used on Linux builds until we figure
+    # out how to hide LLVM symbols in a way that works for all platforms.
+    add_mlir_library(mlir_apfloat_wrappers
+      SHARED
+      APFloatWrappers.cpp
+
+      EXCLUDE_FROM_LIBMLIR
+      )
+    set_target_properties(
+      mlir_apfloat_wrappers
+      PROPERTIES CXX_STANDARD 17
+                CXX_VISIBILITY_PRESET hidden
+                VISIBILITY_INLINES_HIDDEN ON
+    )
+    target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS)
+    # Hide LLVM symbols to avoid ODR violations.
+    target_link_options(mlir_apfloat_wrappers PRIVATE "-Wl,--exclude-libs,ALL")
+  endif()
+
   add_subdirectory(SparseTensor)
 
   add_mlir_library(mlir_c_runner_utils
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
new file mode 100644
index 0000000000000..797f42c37a26f
--- /dev/null
+++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
@@ -0,0 +1,128 @@
+// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL:   func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64
+
+// CHECK-LABEL:   func.func @foo() -> f8E4M3FN {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN
+// CHECK:           return %[[CONSTANT_0]] : f8E4M3FN
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @bar() -> f6E3M2FN {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN
+// CHECK:           return %[[CONSTANT_0]] : f6E3M2FN
+// CHECK:         }
+
+// Illustrate that both f8E4M3FN and f6E3M2FN calling the same _mlir_apfloat_add is fine
+// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width.
+// CHECK-LABEL:   func.func @full_example() {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN
+// CHECK:           %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN
+// CHECK:           %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8
+// CHECK:           %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64
+// CHECK:           %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8
+// CHECK:           %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64
+//                  // fltSemantics semantics for f8E4M3FN
+// CHECK:           %[[CONSTANT_1:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_1:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64
+// CHECK:           %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8
+// CHECK:           %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN
+// CHECK:           vector.print %[[BITCAST_2]] : f8E4M3FN
+
+// CHECK:           %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN
+// CHECK:           %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN
+// CHECK:           %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6
+// CHECK:           %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64
+// CHECK:           %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6
+// CHECK:           %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64
+//                  // fltSemantics semantics for f6E3M2FN
+// CHECK:           %[[CONSTANT_3:.*]] = arith.constant 16 : i32
+// CHECK:           %[[VAL_3:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64
+// CHECK:           %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6
+// CHECK:           %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN
+// CHECK:           vector.print %[[BITCAST_5]] : f6E3M2FN
+// CHECK:           return
+// CHECK:         }
+
+// Put rhs into separate function so that it won't be constant-folded.
+func.func @foo() -> f8E4M3FN {
+  %cst = arith.constant 2.2 : f8E4M3FN
+  return %cst : f8E4M3FN
+}
+
+func.func @bar() -> f6E3M2FN {
+  %cst = arith.constant 3.2 : f6E3M2FN
+  return %cst : f6E3M2FN
+}
+
+func.func @full_example() {
+  %a = arith.constant 1.4 : f8E4M3FN
+  %b = func.call @foo() : () -> (f8E4M3FN)
+  %c = arith.addf %a, %b : f8E4M3FN
+  vector.print %c : f8E4M3FN
+
+  %d = arith.constant 2.4 : f6E3M2FN
+  %e = func.call @bar() : () -> (f6E3M2FN)
+  %f = arith.addf %d, %e : f6E3M2FN
+  vector.print %f : f6E3M2FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// Test decl collision (different type)
+// expected-error@+1{{matched function '_mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}}
+func.func private @_mlir_apfloat_add(i32, i32, f32) -> index
+func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_subtract(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.subf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_multiply(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.mulf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_divide(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.divf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_remainder(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.remf %arg0, %arg1 : f4E2M1FN
+  return
+}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
new file mode 100644
index 0000000000000..dbaa20346a03a
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
@@ -0,0 +1,40 @@
+// REQUIRES: system-linux
+// TODO: Run only on Linux until we figure out how to build
+// mlir_apfloat_wrappers in a platform-independent way.
+
+// Case 1: All floating-point arithmetic is lowered through APFloat.
+// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \
+// RUN: mlir-runner -e entry --entry-point-result=void \
+// RUN:             --shared-libs=%mlir_c_runner_utils \
+// RUN:             --shared-libs=%mlir_apfloat_wrappers | FileCheck %s
+
+// Case 2: Only unsupported arithmetic (f8E4M3FN) is lowered through APFloat.
+//         Arithmetic on f32 is lowered directly to LLVM.
+// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \
+// RUN:          --convert-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-runner -e entry --entry-point-result=void \
+// RUN:             --shared-libs=%mlir_c_runner_utils \
+// RUN:             --shared-libs=%mlir_apfloat_wrappers | FileCheck %s
+
+// Put rhs into separate function so that it won't be constant-folded.
+func.func @foo() -> (f8E4M3FN, f32) {
+  %cst1 = arith.constant 2.2 : f8E4M3FN
+  %cst2 = arith.constant 2.2 : f32
+  return %cst1, %cst2 : f8E4M3FN, f32
+}
+
+func.func @entry() {
+  %a1 = arith.constant 1.4 : f8E4M3FN
+  %a2 = arith.constant 1.4 : f32
+  %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
+  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
+  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
+
+  // CHECK: 3.5
+  vector.print %c1 : f8E4M3FN
+
+  // CHECK: 3.6
+  vector.print %c2 : f32
+
+  return
+}
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 6ff12d66523f5..7081c51994ec1 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -214,6 +214,11 @@ def find_real_python_interpreter():
     "not",
 ]
 
+if "Linux" in config.host_os:
+    # TODO: Run only on Linux until we figure out how to build
+    # mlir_apfloat_wrappers in a platform-independent way.
+    tools.extend([add_runtime("mlir_apfloat_wrappers")])
+
 if config.enable_vulkan_runner:
     tools.extend([add_runtime("mlir_vulkan_runtime")])
 

From 7e5155a98c90be603cb29fa6cc102df12674b255 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Thu, 13 Nov 2025 18:58:01 +0900
Subject: [PATCH 20/29] [mlir] Fix build after #167848 (#167855)

Fix build after #167848.
---
 mlir/lib/ExecutionEngine/CMakeLists.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index c813a431270d0..90024b1c8206e 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -205,6 +205,11 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_c_runner_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_c_runner_utils PRIVATE mlir_c_runner_utils_EXPORTS)
 
+  # Conditionally link apfloat wrappers only on Linux.
+  if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    target_link_libraries(mlir_c_runner_utils PUBLIC mlir_apfloat_wrappers)
+  endif()
+
   add_mlir_library(mlir_runner_utils
     SHARED
     RunnerUtils.cpp
@@ -216,6 +221,11 @@ if(LLVM_ENABLE_PIC)
   )
   target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
 
+  # Conditionally link apfloat wrappers only on Linux.
+  if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    target_link_libraries(mlir_runner_utils PUBLIC mlir_apfloat_wrappers)
+  endif()
+
   add_mlir_library(mlir_async_runtime
     SHARED
     AsyncRuntime.cpp

From a25daa33f01b193472b1f74dc3ab49fcf5757329 Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr@nvidia.com>
Date: Thu, 13 Nov 2025 15:33:46 +0530
Subject: [PATCH 21/29] [NVPTX] Add TMA non-tensor variant of g2s-cta intrinsic
 (#167508)

This patch adds a TMA intrinsic for Global to
shared::cta copy, which was introduced with ptx86.
Also remove the NoCapture<> annotation from the
pointer arguments to these intrinsics, since the
copy operations are asynchronous in nature.

lit tests are verified with a ptxas from cuda-12.8.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
---
 llvm/docs/NVPTXUsage.rst                      | 26 +++++++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td        | 22 ++++++---
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      | 19 ++++++++
 .../test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll | 46 +++++++++++++++++++
 4 files changed, 106 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 5ad8f9ab07e40..39f0556aef5a2 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1334,6 +1334,32 @@ copied and it must be a multiple of 16.
 For more information, refer PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
 
+'``llvm.nvvm.cp.async.bulk.global.to.shared.cta``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.global.to.shared.cta``' intrinsic
+corresponds to the ``cp.async.bulk.shared::cta.global.*`` family
+of PTX instructions. These instructions initiate an asynchronous
+copy of bulk data from global memory to shared::cta memory.
+The 32-bit operand ``%size`` specifies the amount of memory to be
+copied and it must be a multiple of 16. The last argument
+(denoted by ``i1 %flag_ch``) is a compile-time constant. When set,
+it indicates a valid cache_hint (``i64 %ch``) and generates the
+``.L2::cache_hint`` variant of the PTX instruction.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+
 '``llvm.nvvm.cp.async.bulk.shared.cta.to.global``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2710853e17688..21badc2692037 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2716,8 +2716,19 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
       [llvm_i1_ty,                 // Flag for cta_mask
        llvm_i1_ty],                // Flag for cache_hint
       [IntrConvergent, IntrArgMemOnly,
-       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
-       NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>]>;
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
+
+// From Global to Shared CTA
+def int_nvvm_cp_async_bulk_global_to_shared_cta
+  : DefaultAttrsIntrinsicFlags<[],
+      [llvm_shared_ptr_ty, // dst_shared_cta_ptr
+       llvm_shared_ptr_ty, // mbarrier_ptr
+       llvm_global_ptr_ty, // src_gmem_ptr
+       llvm_i32_ty,        // copy_size
+       llvm_i64_ty],       // cache_hint
+      [llvm_i1_ty],        // Flag for cache_hint
+      [IntrConvergent, IntrArgMemOnly,
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
 
 // From Shared CTA to Shared Cluster
 def int_nvvm_cp_async_bulk_shared_cta_to_cluster
@@ -2727,9 +2738,7 @@ def int_nvvm_cp_async_bulk_shared_cta_to_cluster
        llvm_shared_ptr_ty,         // src_smem_ptr
        llvm_i32_ty],               // copy_size
       [IntrConvergent, IntrArgMemOnly,
-       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
-       NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
-       NoCapture<ArgIndex<2>>]>;
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
 
 // From Shared CTA to Global memory
 def int_nvvm_cp_async_bulk_shared_cta_to_global
@@ -2740,8 +2749,7 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
        llvm_i64_ty],       // cache_hint
       [llvm_i1_ty],        // Flag for cache_hint
       [IntrConvergent, IntrArgMemOnly,
-       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
-       NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;
 
 // From Shared CTA to Global memory with bytemask
 def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 50827bd548ad5..ea69a54e6db37 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -497,6 +497,10 @@ class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
                # !if(mc, ".multicast::cluster", "")
                # !if(ch, ".L2::cache_hint", "");
 
+  // Global to Shared CTA memory
+  string G2S_CTA = "cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes"
+                   # !if(ch, ".L2::cache_hint", "");
+
   // Shared CTA to Cluster memory
   string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes";
 }
@@ -543,6 +547,21 @@ multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
 defm CP_ASYNC_BULK_G2S    : CP_ASYNC_BULK_G2S_INTR<has_ch = 0>;
 defm CP_ASYNC_BULK_G2S_CH : CP_ASYNC_BULK_G2S_INTR<has_ch = 1>;
 
+multiclass CP_ASYNC_BULK_G2S_CTA_INTR<bit has_ch> {
+  defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cta;
+
+  def "" : NVPTXInst<(outs),
+      (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
+           B32:$size, B64:$ch),
+      !if(has_ch,
+          CpAsyncBulkStr<0, 1>.G2S_CTA # " [$dst], [$src], $size, [$mbar], $ch;",
+          CpAsyncBulkStr<0, 0>.G2S_CTA # " [$dst], [$src], $size, [$mbar];"),
+      [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
+      Requires<[hasPTX<86>, hasSM<90>]>;
+}
+defm CP_ASYNC_BULK_G2S_CTA    : CP_ASYNC_BULK_G2S_CTA_INTR<has_ch = 0>;
+defm CP_ASYNC_BULK_G2S_CTA_CH : CP_ASYNC_BULK_G2S_CTA_INTR<has_ch = 1>;
+
 def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
   (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, B32:$size),
   CpAsyncBulkStr<0, 0>.C2C # " [$dst], [$src], $size, [$mbar];",
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll
new file mode 100644
index 0000000000000..9872b2aa0826b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i64, i1)
+
+define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
+; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}

From ef9a02ce028782684f9a43dcda756804635ba86a Mon Sep 17 00:00:00 2001
From: Sergei Barannikov <barannikov88@gmail.com>
Date: Thu, 13 Nov 2025 13:26:58 +0300
Subject: [PATCH 22/29] [CodeGen] Use VirtRegOrUnit where appropriate (NFCI)
 (#167730)

Use it in `printVRegOrUnit()`, `getPressureSets()`/`PSetIterator`,
and in functions/classes dealing with register pressure.

Static type checking revealed several bugs, mainly in MachinePipeliner.
I'm not very familiar with this pass, so I left a bunch of FIXMEs.

There is one bug in `findUseBetween()` in RegisterPressure.cpp, also
annotated with a FIXME.
---
 .../llvm/CodeGen/MachineRegisterInfo.h        |  22 +-
 llvm/include/llvm/CodeGen/Register.h          |   4 +
 llvm/include/llvm/CodeGen/RegisterPressure.h  |  51 ++--
 .../include/llvm/CodeGen/TargetRegisterInfo.h |   2 +-
 llvm/lib/CodeGen/MachinePipeliner.cpp         |  66 +++--
 llvm/lib/CodeGen/MachineScheduler.cpp         |  14 +-
 llvm/lib/CodeGen/RegisterPressure.cpp         | 276 +++++++++---------
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       |  11 +-
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     |  22 +-
 llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp |  63 ++--
 llvm/lib/Target/AMDGPU/SIMachineScheduler.h   |  14 +-
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp    |  23 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp      |   6 +-
 13 files changed, 313 insertions(+), 261 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 6982dae4718d1..737b74ef3f761 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -634,10 +634,9 @@ class MachineRegisterInfo {
   /// function. Writing to a constant register has no effect.
   LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const;
 
-  /// Get an iterator over the pressure sets affected by the given physical or
-  /// virtual register. If RegUnit is physical, it must be a register unit (from
-  /// MCRegUnitIterator).
-  PSetIterator getPressureSets(Register RegUnit) const;
+  /// Get an iterator over the pressure sets affected by the virtual register
+  /// or register unit.
+  PSetIterator getPressureSets(VirtRegOrUnit VRegOrUnit) const;
 
   //===--------------------------------------------------------------------===//
   // Virtual Register Info
@@ -1249,15 +1248,16 @@ class PSetIterator {
 public:
   PSetIterator() = default;
 
-  PSetIterator(Register RegUnit, const MachineRegisterInfo *MRI) {
+  PSetIterator(VirtRegOrUnit VRegOrUnit, const MachineRegisterInfo *MRI) {
     const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
-    if (RegUnit.isVirtual()) {
-      const TargetRegisterClass *RC = MRI->getRegClass(RegUnit);
+    if (VRegOrUnit.isVirtualReg()) {
+      const TargetRegisterClass *RC =
+          MRI->getRegClass(VRegOrUnit.asVirtualReg());
       PSet = TRI->getRegClassPressureSets(RC);
       Weight = TRI->getRegClassWeight(RC).RegWeight;
     } else {
-      PSet = TRI->getRegUnitPressureSets(RegUnit);
-      Weight = TRI->getRegUnitWeight(RegUnit);
+      PSet = TRI->getRegUnitPressureSets(VRegOrUnit.asMCRegUnit());
+      Weight = TRI->getRegUnitWeight(VRegOrUnit.asMCRegUnit());
     }
     if (*PSet == -1)
       PSet = nullptr;
@@ -1278,8 +1278,8 @@ class PSetIterator {
 };
 
 inline PSetIterator
-MachineRegisterInfo::getPressureSets(Register RegUnit) const {
-  return PSetIterator(RegUnit, this);
+MachineRegisterInfo::getPressureSets(VirtRegOrUnit VRegOrUnit) const {
+  return PSetIterator(VRegOrUnit, this);
 }
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h
index 790db8a11e390..5e1e12942a019 100644
--- a/llvm/include/llvm/CodeGen/Register.h
+++ b/llvm/include/llvm/CodeGen/Register.h
@@ -206,6 +206,10 @@ class VirtRegOrUnit {
   constexpr bool operator==(const VirtRegOrUnit &Other) const {
     return VRegOrUnit == Other.VRegOrUnit;
   }
+
+  constexpr bool operator<(const VirtRegOrUnit &Other) const {
+    return VRegOrUnit < Other.VRegOrUnit;
+  }
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/RegisterPressure.h b/llvm/include/llvm/CodeGen/RegisterPressure.h
index 261e5b0d73281..20a7e4fa2e9de 100644
--- a/llvm/include/llvm/CodeGen/RegisterPressure.h
+++ b/llvm/include/llvm/CodeGen/RegisterPressure.h
@@ -37,11 +37,11 @@ class MachineRegisterInfo;
 class RegisterClassInfo;
 
 struct VRegMaskOrUnit {
-  Register RegUnit; ///< Virtual register or register unit.
+  VirtRegOrUnit VRegOrUnit;
   LaneBitmask LaneMask;
 
-  VRegMaskOrUnit(Register RegUnit, LaneBitmask LaneMask)
-      : RegUnit(RegUnit), LaneMask(LaneMask) {}
+  VRegMaskOrUnit(VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask)
+      : VRegOrUnit(VRegOrUnit), LaneMask(LaneMask) {}
 };
 
 /// Base class for register pressure results.
@@ -157,7 +157,7 @@ class PressureDiff {
   const_iterator begin() const { return &PressureChanges[0]; }
   const_iterator end() const { return &PressureChanges[MaxPSets]; }
 
-  LLVM_ABI void addPressureChange(Register RegUnit, bool IsDec,
+  LLVM_ABI void addPressureChange(VirtRegOrUnit VRegOrUnit, bool IsDec,
                                   const MachineRegisterInfo *MRI);
 
   LLVM_ABI void dump(const TargetRegisterInfo &TRI) const;
@@ -279,25 +279,25 @@ class LiveRegSet {
   RegSet Regs;
   unsigned NumRegUnits = 0u;
 
-  unsigned getSparseIndexFromReg(Register Reg) const {
-    if (Reg.isVirtual())
-      return Reg.virtRegIndex() + NumRegUnits;
-    assert(Reg < NumRegUnits);
-    return Reg.id();
+  unsigned getSparseIndexFromVirtRegOrUnit(VirtRegOrUnit VRegOrUnit) const {
+    if (VRegOrUnit.isVirtualReg())
+      return VRegOrUnit.asVirtualReg().virtRegIndex() + NumRegUnits;
+    assert(VRegOrUnit.asMCRegUnit() < NumRegUnits);
+    return VRegOrUnit.asMCRegUnit();
   }
 
-  Register getRegFromSparseIndex(unsigned SparseIndex) const {
+  VirtRegOrUnit getVirtRegOrUnitFromSparseIndex(unsigned SparseIndex) const {
     if (SparseIndex >= NumRegUnits)
-      return Register::index2VirtReg(SparseIndex - NumRegUnits);
-    return Register(SparseIndex);
+      return VirtRegOrUnit(Register::index2VirtReg(SparseIndex - NumRegUnits));
+    return VirtRegOrUnit(SparseIndex);
   }
 
 public:
   LLVM_ABI void clear();
   LLVM_ABI void init(const MachineRegisterInfo &MRI);
 
-  LaneBitmask contains(Register Reg) const {
-    unsigned SparseIndex = getSparseIndexFromReg(Reg);
+  LaneBitmask contains(VirtRegOrUnit VRegOrUnit) const {
+    unsigned SparseIndex = getSparseIndexFromVirtRegOrUnit(VRegOrUnit);
     RegSet::const_iterator I = Regs.find(SparseIndex);
     if (I == Regs.end())
       return LaneBitmask::getNone();
@@ -307,7 +307,7 @@ class LiveRegSet {
   /// Mark the \p Pair.LaneMask lanes of \p Pair.Reg as live.
   /// Returns the previously live lanes of \p Pair.Reg.
   LaneBitmask insert(VRegMaskOrUnit Pair) {
-    unsigned SparseIndex = getSparseIndexFromReg(Pair.RegUnit);
+    unsigned SparseIndex = getSparseIndexFromVirtRegOrUnit(Pair.VRegOrUnit);
     auto InsertRes = Regs.insert(IndexMaskPair(SparseIndex, Pair.LaneMask));
     if (!InsertRes.second) {
       LaneBitmask PrevMask = InsertRes.first->LaneMask;
@@ -320,7 +320,7 @@ class LiveRegSet {
   /// Clears the \p Pair.LaneMask lanes of \p Pair.Reg (mark them as dead).
   /// Returns the previously live lanes of \p Pair.Reg.
   LaneBitmask erase(VRegMaskOrUnit Pair) {
-    unsigned SparseIndex = getSparseIndexFromReg(Pair.RegUnit);
+    unsigned SparseIndex = getSparseIndexFromVirtRegOrUnit(Pair.VRegOrUnit);
     RegSet::iterator I = Regs.find(SparseIndex);
     if (I == Regs.end())
       return LaneBitmask::getNone();
@@ -335,9 +335,9 @@ class LiveRegSet {
 
   void appendTo(SmallVectorImpl<VRegMaskOrUnit> &To) const {
     for (const IndexMaskPair &P : Regs) {
-      Register Reg = getRegFromSparseIndex(P.Index);
+      VirtRegOrUnit VRegOrUnit = getVirtRegOrUnitFromSparseIndex(P.Index);
       if (P.LaneMask.any())
-        To.emplace_back(Reg, P.LaneMask);
+        To.emplace_back(VRegOrUnit, P.LaneMask);
     }
   }
 };
@@ -541,9 +541,11 @@ class RegPressureTracker {
 
   LLVM_ABI void dump() const;
 
-  LLVM_ABI void increaseRegPressure(Register RegUnit, LaneBitmask PreviousMask,
+  LLVM_ABI void increaseRegPressure(VirtRegOrUnit VRegOrUnit,
+                                    LaneBitmask PreviousMask,
                                     LaneBitmask NewMask);
-  LLVM_ABI void decreaseRegPressure(Register RegUnit, LaneBitmask PreviousMask,
+  LLVM_ABI void decreaseRegPressure(VirtRegOrUnit VRegOrUnit,
+                                    LaneBitmask PreviousMask,
                                     LaneBitmask NewMask);
 
 protected:
@@ -565,9 +567,12 @@ class RegPressureTracker {
   discoverLiveInOrOut(VRegMaskOrUnit Pair,
                       SmallVectorImpl<VRegMaskOrUnit> &LiveInOrOut);
 
-  LLVM_ABI LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const;
-  LLVM_ABI LaneBitmask getLiveLanesAt(Register RegUnit, SlotIndex Pos) const;
-  LLVM_ABI LaneBitmask getLiveThroughAt(Register RegUnit, SlotIndex Pos) const;
+  LLVM_ABI LaneBitmask getLastUsedLanes(VirtRegOrUnit VRegOrUnit,
+                                        SlotIndex Pos) const;
+  LLVM_ABI LaneBitmask getLiveLanesAt(VirtRegOrUnit VRegOrUnit,
+                                      SlotIndex Pos) const;
+  LLVM_ABI LaneBitmask getLiveThroughAt(VirtRegOrUnit VRegOrUnit,
+                                        SlotIndex Pos) const;
 };
 
 LLVM_ABI void dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index dabf0dc5ec173..35b14e8b8fd30 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1450,7 +1450,7 @@ LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI);
 
 /// Create Printable object to print virtual registers and physical
 /// registers on a \ref raw_ostream.
-LLVM_ABI Printable printVRegOrUnit(unsigned VRegOrUnit,
+LLVM_ABI Printable printVRegOrUnit(VirtRegOrUnit VRegOrUnit,
                                    const TargetRegisterInfo *TRI);
 
 /// Create Printable object to print register classes or register banks
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index a717d9e4a618d..e2f7dfc5cadd5 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1509,7 +1509,11 @@ class HighRegisterPressureDetector {
 
   void dumpPSet(Register Reg) const {
     dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
-    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
+    // FIXME: The static_cast is a bug compensating for bugs in the callers.
+    VirtRegOrUnit VRegOrUnit =
+        Reg.isVirtual() ? VirtRegOrUnit(Reg)
+                        : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
+    for (auto PSetIter = MRI.getPressureSets(VRegOrUnit); PSetIter.isValid();
          ++PSetIter) {
       dbgs() << *PSetIter << ' ';
     }
@@ -1518,7 +1522,11 @@ class HighRegisterPressureDetector {
 
   void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                 Register Reg) const {
-    auto PSetIter = MRI.getPressureSets(Reg);
+    // FIXME: The static_cast is a bug compensating for bugs in the callers.
+    VirtRegOrUnit VRegOrUnit =
+        Reg.isVirtual() ? VirtRegOrUnit(Reg)
+                        : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
+    auto PSetIter = MRI.getPressureSets(VRegOrUnit);
     unsigned Weight = PSetIter.getWeight();
     for (; PSetIter.isValid(); ++PSetIter)
       Pressure[*PSetIter] += Weight;
@@ -1526,7 +1534,7 @@ class HighRegisterPressureDetector {
 
   void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                 Register Reg) const {
-    auto PSetIter = MRI.getPressureSets(Reg);
+    auto PSetIter = MRI.getPressureSets(VirtRegOrUnit(Reg));
     unsigned Weight = PSetIter.getWeight();
     for (; PSetIter.isValid(); ++PSetIter) {
       auto &P = Pressure[*PSetIter];
@@ -1559,7 +1567,11 @@ class HighRegisterPressureDetector {
       if (MI.isDebugInstr())
         continue;
       for (auto &Use : ROMap[&MI].Uses) {
-        auto Reg = Use.RegUnit;
+        // FIXME: The static_cast is a bug.
+        Register Reg =
+            Use.VRegOrUnit.isVirtualReg()
+                ? Use.VRegOrUnit.asVirtualReg()
+                : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
         // Ignore the variable that appears only on one side of phi instruction
         // because it's used only at the first iteration.
         if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
@@ -1609,8 +1621,14 @@ class HighRegisterPressureDetector {
         Register Reg = getLoopPhiReg(*MI, OrigMBB);
         UpdateTargetRegs(Reg);
       } else {
-        for (auto &Use : ROMap.find(MI)->getSecond().Uses)
-          UpdateTargetRegs(Use.RegUnit);
+        for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
+          // FIXME: The static_cast is a bug.
+          Register Reg = Use.VRegOrUnit.isVirtualReg()
+                             ? Use.VRegOrUnit.asVirtualReg()
+                             : Register(static_cast<unsigned>(
+                                   Use.VRegOrUnit.asMCRegUnit()));
+          UpdateTargetRegs(Reg);
+        }
       }
     }
 
@@ -1621,7 +1639,11 @@ class HighRegisterPressureDetector {
     DenseMap<Register, MachineInstr *> LastUseMI;
     for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
       for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
-        auto Reg = Use.RegUnit;
+        // FIXME: The static_cast is a bug.
+        Register Reg =
+            Use.VRegOrUnit.isVirtualReg()
+                ? Use.VRegOrUnit.asVirtualReg()
+                : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
         if (!TargetRegs.contains(Reg))
           continue;
         auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
@@ -1635,8 +1657,8 @@ class HighRegisterPressureDetector {
     }
 
     Instr2LastUsesTy LastUses;
-    for (auto &Entry : LastUseMI)
-      LastUses[Entry.second].insert(Entry.first);
+    for (auto [Reg, MI] : LastUseMI)
+      LastUses[MI].insert(Reg);
     return LastUses;
   }
 
@@ -1675,7 +1697,12 @@ class HighRegisterPressureDetector {
     });
 
     const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
-                                                   Register Reg) {
+                                                   VirtRegOrUnit VRegOrUnit) {
+      // FIXME: The static_cast is a bug.
+      Register Reg =
+          VRegOrUnit.isVirtualReg()
+              ? VRegOrUnit.asVirtualReg()
+              : Register(static_cast<unsigned>(VRegOrUnit.asMCRegUnit()));
       if (!Reg.isValid() || isReservedRegister(Reg))
         return;
 
@@ -1712,7 +1739,7 @@ class HighRegisterPressureDetector {
         const unsigned Iter = I - Stage;
 
         for (auto &Def : ROMap.find(MI)->getSecond().Defs)
-          InsertReg(LiveRegSets[Iter], Def.RegUnit);
+          InsertReg(LiveRegSets[Iter], Def.VRegOrUnit);
 
         for (auto LastUse : LastUses[MI]) {
           if (MI->isPHI()) {
@@ -2235,7 +2262,7 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   SmallVector<VRegMaskOrUnit, 8> LiveOutRegs;
-  SmallSet<Register, 4> Uses;
+  SmallSet<VirtRegOrUnit, 4> Uses;
   for (SUnit *SU : NS) {
     const MachineInstr *MI = SU->getInstr();
     if (MI->isPHI())
@@ -2243,9 +2270,10 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
     for (const MachineOperand &MO : MI->all_uses()) {
       Register Reg = MO.getReg();
       if (Reg.isVirtual())
-        Uses.insert(Reg);
+        Uses.insert(VirtRegOrUnit(Reg));
       else if (MRI.isAllocatable(Reg))
-        Uses.insert_range(TRI->regunits(Reg.asMCReg()));
+        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+          Uses.insert(VirtRegOrUnit(Unit));
     }
   }
   for (SUnit *SU : NS)
@@ -2253,12 +2281,14 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
       if (!MO.isDead()) {
         Register Reg = MO.getReg();
         if (Reg.isVirtual()) {
-          if (!Uses.count(Reg))
-            LiveOutRegs.emplace_back(Reg, LaneBitmask::getNone());
+          if (!Uses.count(VirtRegOrUnit(Reg)))
+            LiveOutRegs.emplace_back(VirtRegOrUnit(Reg),
+                                     LaneBitmask::getNone());
         } else if (MRI.isAllocatable(Reg)) {
           for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
-            if (!Uses.count(Unit))
-              LiveOutRegs.emplace_back(Unit, LaneBitmask::getNone());
+            if (!Uses.count(VirtRegOrUnit(Unit)))
+              LiveOutRegs.emplace_back(VirtRegOrUnit(Unit),
+                                       LaneBitmask::getNone());
         }
       }
   RPTracker.addLiveRegs(LiveOutRegs);
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 73993705c4a7b..de29a9fab876e 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1580,10 +1580,10 @@ updateScheduledPressure(const SUnit *SU,
 /// instruction.
 void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<VRegMaskOrUnit> LiveUses) {
   for (const VRegMaskOrUnit &P : LiveUses) {
-    Register Reg = P.RegUnit;
     /// FIXME: Currently assuming single-use physregs.
-    if (!Reg.isVirtual())
+    if (!P.VRegOrUnit.isVirtualReg())
       continue;
+    Register Reg = P.VRegOrUnit.asVirtualReg();
 
     if (ShouldTrackLaneMasks) {
       // If the register has just become live then other uses won't change
@@ -1599,7 +1599,7 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<VRegMaskOrUnit> LiveUses) {
           continue;
 
         PressureDiff &PDiff = getPressureDiff(&SU);
-        PDiff.addPressureChange(Reg, Decrement, &MRI);
+        PDiff.addPressureChange(VirtRegOrUnit(Reg), Decrement, &MRI);
         if (llvm::any_of(PDiff, [](const PressureChange &Change) {
               return Change.isValid();
             }))
@@ -1611,7 +1611,7 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<VRegMaskOrUnit> LiveUses) {
       }
     } else {
       assert(P.LaneMask.any());
-      LLVM_DEBUG(dbgs() << "  LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n");
+      LLVM_DEBUG(dbgs() << "  LiveReg: " << printReg(Reg, TRI) << "\n");
       // This may be called before CurrentBottom has been initialized. However,
       // BotRPTracker must have a valid position. We want the value live into the
       // instruction or live out of the block, so ask for the previous
@@ -1638,7 +1638,7 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<VRegMaskOrUnit> LiveUses) {
               LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
           if (LRQ.valueIn() == VNI) {
             PressureDiff &PDiff = getPressureDiff(SU);
-            PDiff.addPressureChange(Reg, true, &MRI);
+            PDiff.addPressureChange(VirtRegOrUnit(Reg), true, &MRI);
             if (llvm::any_of(PDiff, [](const PressureChange &Change) {
                   return Change.isValid();
                 }))
@@ -1814,9 +1814,9 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
   unsigned MaxCyclicLatency = 0;
   // Visit each live out vreg def to find def/use pairs that cross iterations.
   for (const VRegMaskOrUnit &P : RPTracker.getPressure().LiveOutRegs) {
-    Register Reg = P.RegUnit;
-    if (!Reg.isVirtual())
+    if (!P.VRegOrUnit.isVirtualReg())
       continue;
+    Register Reg = P.VRegOrUnit.asVirtualReg();
     const LiveInterval &LI = LIS->getInterval(Reg);
     const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
     if (!DefVNI)
diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp
index 7d4674b3f74f0..cd431bc7a171c 100644
--- a/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -47,13 +47,14 @@ using namespace llvm;
 
 /// Increase pressure for each pressure set provided by TargetRegisterInfo.
 static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
-                                const MachineRegisterInfo &MRI, unsigned Reg,
-                                LaneBitmask PrevMask, LaneBitmask NewMask) {
+                                const MachineRegisterInfo &MRI,
+                                VirtRegOrUnit VRegOrUnit, LaneBitmask PrevMask,
+                                LaneBitmask NewMask) {
   assert((PrevMask & ~NewMask).none() && "Must not remove bits");
   if (PrevMask.any() || NewMask.none())
     return;
 
-  PSetIterator PSetI = MRI.getPressureSets(Reg);
+  PSetIterator PSetI = MRI.getPressureSets(VRegOrUnit);
   unsigned Weight = PSetI.getWeight();
   for (; PSetI.isValid(); ++PSetI)
     CurrSetPressure[*PSetI] += Weight;
@@ -61,13 +62,14 @@ static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
 
 /// Decrease pressure for each pressure set provided by TargetRegisterInfo.
 static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
-                                const MachineRegisterInfo &MRI, Register Reg,
-                                LaneBitmask PrevMask, LaneBitmask NewMask) {
+                                const MachineRegisterInfo &MRI,
+                                VirtRegOrUnit VRegOrUnit, LaneBitmask PrevMask,
+                                LaneBitmask NewMask) {
   assert((NewMask & ~PrevMask).none() && "Must not add bits");
   if (NewMask.any() || PrevMask.none())
     return;
 
-  PSetIterator PSetI = MRI.getPressureSets(Reg);
+  PSetIterator PSetI = MRI.getPressureSets(VRegOrUnit);
   unsigned Weight = PSetI.getWeight();
   for (; PSetI.isValid(); ++PSetI) {
     assert(CurrSetPressure[*PSetI] >= Weight && "register pressure underflow");
@@ -93,7 +95,7 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
   dumpRegSetPressure(MaxSetPressure, TRI);
   dbgs() << "Live In: ";
   for (const VRegMaskOrUnit &P : LiveInRegs) {
-    dbgs() << printVRegOrUnit(P.RegUnit, TRI);
+    dbgs() << printVRegOrUnit(P.VRegOrUnit, TRI);
     if (!P.LaneMask.all())
       dbgs() << ':' << PrintLaneMask(P.LaneMask);
     dbgs() << ' ';
@@ -101,7 +103,7 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
   dbgs() << '\n';
   dbgs() << "Live Out: ";
   for (const VRegMaskOrUnit &P : LiveOutRegs) {
-    dbgs() << printVRegOrUnit(P.RegUnit, TRI);
+    dbgs() << printVRegOrUnit(P.VRegOrUnit, TRI);
     if (!P.LaneMask.all())
       dbgs() << ':' << PrintLaneMask(P.LaneMask);
     dbgs() << ' ';
@@ -148,13 +150,13 @@ void RegPressureDelta::dump() const {
 
 #endif
 
-void RegPressureTracker::increaseRegPressure(Register RegUnit,
+void RegPressureTracker::increaseRegPressure(VirtRegOrUnit VRegOrUnit,
                                              LaneBitmask PreviousMask,
                                              LaneBitmask NewMask) {
   if (PreviousMask.any() || NewMask.none())
     return;
 
-  PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+  PSetIterator PSetI = MRI->getPressureSets(VRegOrUnit);
   unsigned Weight = PSetI.getWeight();
   for (; PSetI.isValid(); ++PSetI) {
     CurrSetPressure[*PSetI] += Weight;
@@ -163,10 +165,10 @@ void RegPressureTracker::increaseRegPressure(Register RegUnit,
   }
 }
 
-void RegPressureTracker::decreaseRegPressure(Register RegUnit,
+void RegPressureTracker::decreaseRegPressure(VirtRegOrUnit VRegOrUnit,
                                              LaneBitmask PreviousMask,
                                              LaneBitmask NewMask) {
-  decreaseSetPressure(CurrSetPressure, *MRI, RegUnit, PreviousMask, NewMask);
+  decreaseSetPressure(CurrSetPressure, *MRI, VRegOrUnit, PreviousMask, NewMask);
 }
 
 /// Clear the result so it can be used for another round of pressure tracking.
@@ -230,10 +232,11 @@ void LiveRegSet::clear() {
   Regs.clear();
 }
 
-static const LiveRange *getLiveRange(const LiveIntervals &LIS, unsigned Reg) {
-  if (Register::isVirtualRegister(Reg))
-    return &LIS.getInterval(Reg);
-  return LIS.getCachedRegUnit(Reg);
+static const LiveRange *getLiveRange(const LiveIntervals &LIS,
+                                     VirtRegOrUnit VRegOrUnit) {
+  if (VRegOrUnit.isVirtualReg())
+    return &LIS.getInterval(VRegOrUnit.asVirtualReg());
+  return LIS.getCachedRegUnit(VRegOrUnit.asMCRegUnit());
 }
 
 void RegPressureTracker::reset() {
@@ -356,17 +359,18 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
   LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0);
   assert(isBottomClosed() && "need bottom-up tracking to intialize.");
   for (const VRegMaskOrUnit &Pair : P.LiveOutRegs) {
-    Register RegUnit = Pair.RegUnit;
-    if (RegUnit.isVirtual() && !RPTracker.hasUntiedDef(RegUnit))
-      increaseSetPressure(LiveThruPressure, *MRI, RegUnit,
+    VirtRegOrUnit VRegOrUnit = Pair.VRegOrUnit;
+    if (VRegOrUnit.isVirtualReg() &&
+        !RPTracker.hasUntiedDef(VRegOrUnit.asVirtualReg()))
+      increaseSetPressure(LiveThruPressure, *MRI, VRegOrUnit,
                           LaneBitmask::getNone(), Pair.LaneMask);
   }
 }
 
 static LaneBitmask getRegLanes(ArrayRef<VRegMaskOrUnit> RegUnits,
-                               Register RegUnit) {
-  auto I = llvm::find_if(RegUnits, [RegUnit](const VRegMaskOrUnit Other) {
-    return Other.RegUnit == RegUnit;
+                               VirtRegOrUnit VRegOrUnit) {
+  auto I = llvm::find_if(RegUnits, [VRegOrUnit](const VRegMaskOrUnit Other) {
+    return Other.VRegOrUnit == VRegOrUnit;
   });
   if (I == RegUnits.end())
     return LaneBitmask::getNone();
@@ -375,10 +379,10 @@ static LaneBitmask getRegLanes(ArrayRef<VRegMaskOrUnit> RegUnits,
 
 static void addRegLanes(SmallVectorImpl<VRegMaskOrUnit> &RegUnits,
                         VRegMaskOrUnit Pair) {
-  Register RegUnit = Pair.RegUnit;
+  VirtRegOrUnit VRegOrUnit = Pair.VRegOrUnit;
   assert(Pair.LaneMask.any());
-  auto I = llvm::find_if(RegUnits, [RegUnit](const VRegMaskOrUnit Other) {
-    return Other.RegUnit == RegUnit;
+  auto I = llvm::find_if(RegUnits, [VRegOrUnit](const VRegMaskOrUnit Other) {
+    return Other.VRegOrUnit == VRegOrUnit;
   });
   if (I == RegUnits.end()) {
     RegUnits.push_back(Pair);
@@ -388,12 +392,12 @@ static void addRegLanes(SmallVectorImpl<VRegMaskOrUnit> &RegUnits,
 }
 
 static void setRegZero(SmallVectorImpl<VRegMaskOrUnit> &RegUnits,
-                       Register RegUnit) {
-  auto I = llvm::find_if(RegUnits, [RegUnit](const VRegMaskOrUnit Other) {
-    return Other.RegUnit == RegUnit;
+                       VirtRegOrUnit VRegOrUnit) {
+  auto I = llvm::find_if(RegUnits, [VRegOrUnit](const VRegMaskOrUnit Other) {
+    return Other.VRegOrUnit == VRegOrUnit;
   });
   if (I == RegUnits.end()) {
-    RegUnits.emplace_back(RegUnit, LaneBitmask::getNone());
+    RegUnits.emplace_back(VRegOrUnit, LaneBitmask::getNone());
   } else {
     I->LaneMask = LaneBitmask::getNone();
   }
@@ -401,10 +405,10 @@ static void setRegZero(SmallVectorImpl<VRegMaskOrUnit> &RegUnits,
 
 static void removeRegLanes(SmallVectorImpl<VRegMaskOrUnit> &RegUnits,
                            VRegMaskOrUnit Pair) {
-  Register RegUnit = Pair.RegUnit;
+  VirtRegOrUnit VRegOrUnit = Pair.VRegOrUnit;
   assert(Pair.LaneMask.any());
-  auto I = llvm::find_if(RegUnits, [RegUnit](const VRegMaskOrUnit Other) {
-    return Other.RegUnit == RegUnit;
+  auto I = llvm::find_if(RegUnits, [VRegOrUnit](const VRegMaskOrUnit Other) {
+    return Other.VRegOrUnit == VRegOrUnit;
   });
   if (I != RegUnits.end()) {
     I->LaneMask &= ~Pair.LaneMask;
@@ -415,11 +419,11 @@ static void removeRegLanes(SmallVectorImpl<VRegMaskOrUnit> &RegUnits,
 
 static LaneBitmask
 getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
-                     bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
-                     LaneBitmask SafeDefault,
+                     bool TrackLaneMasks, VirtRegOrUnit VRegOrUnit,
+                     SlotIndex Pos, LaneBitmask SafeDefault,
                      bool (*Property)(const LiveRange &LR, SlotIndex Pos)) {
-  if (RegUnit.isVirtual()) {
-    const LiveInterval &LI = LIS.getInterval(RegUnit);
+  if (VRegOrUnit.isVirtualReg()) {
+    const LiveInterval &LI = LIS.getInterval(VRegOrUnit.asVirtualReg());
     LaneBitmask Result;
     if (TrackLaneMasks && LI.hasSubRanges()) {
         for (const LiveInterval::SubRange &SR : LI.subranges()) {
@@ -427,13 +431,14 @@ getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
             Result |= SR.LaneMask;
         }
     } else if (Property(LI, Pos)) {
-      Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit)
-                              : LaneBitmask::getAll();
+      Result = TrackLaneMasks
+                   ? MRI.getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg())
+                   : LaneBitmask::getAll();
     }
 
     return Result;
   } else {
-    const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+    const LiveRange *LR = LIS.getCachedRegUnit(VRegOrUnit.asMCRegUnit());
     // Be prepared for missing liveranges: We usually do not compute liveranges
     // for physical registers on targets with many registers (GPUs).
     if (LR == nullptr)
@@ -444,13 +449,11 @@ getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
 
 static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS,
                                   const MachineRegisterInfo &MRI,
-                                  bool TrackLaneMasks, Register RegUnit,
+                                  bool TrackLaneMasks, VirtRegOrUnit VRegOrUnit,
                                   SlotIndex Pos) {
-  return getLanesWithProperty(LIS, MRI, TrackLaneMasks, RegUnit, Pos,
-                              LaneBitmask::getAll(),
-                              [](const LiveRange &LR, SlotIndex Pos) {
-                                return LR.liveAt(Pos);
-                              });
+  return getLanesWithProperty(
+      LIS, MRI, TrackLaneMasks, VRegOrUnit, Pos, LaneBitmask::getAll(),
+      [](const LiveRange &LR, SlotIndex Pos) { return LR.liveAt(Pos); });
 }
 
 namespace {
@@ -514,10 +517,12 @@ class RegisterOperandsCollector {
 
   void pushReg(Register Reg, SmallVectorImpl<VRegMaskOrUnit> &RegUnits) const {
     if (Reg.isVirtual()) {
-      addRegLanes(RegUnits, VRegMaskOrUnit(Reg, LaneBitmask::getAll()));
+      addRegLanes(RegUnits,
+                  VRegMaskOrUnit(VirtRegOrUnit(Reg), LaneBitmask::getAll()));
     } else if (MRI.isAllocatable(Reg)) {
       for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg()))
-        addRegLanes(RegUnits, VRegMaskOrUnit(Unit, LaneBitmask::getAll()));
+        addRegLanes(RegUnits,
+                    VRegMaskOrUnit(VirtRegOrUnit(Unit), LaneBitmask::getAll()));
     }
   }
 
@@ -549,10 +554,11 @@ class RegisterOperandsCollector {
       LaneBitmask LaneMask = SubRegIdx != 0
                              ? TRI.getSubRegIndexLaneMask(SubRegIdx)
                              : MRI.getMaxLaneMaskForVReg(Reg);
-      addRegLanes(RegUnits, VRegMaskOrUnit(Reg, LaneMask));
+      addRegLanes(RegUnits, VRegMaskOrUnit(VirtRegOrUnit(Reg), LaneMask));
     } else if (MRI.isAllocatable(Reg)) {
       for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg()))
-        addRegLanes(RegUnits, VRegMaskOrUnit(Unit, LaneBitmask::getAll()));
+        addRegLanes(RegUnits,
+                    VRegMaskOrUnit(VirtRegOrUnit(Unit), LaneBitmask::getAll()));
     }
   }
 };
@@ -574,8 +580,7 @@ void RegisterOperands::detectDeadDefs(const MachineInstr &MI,
                                       const LiveIntervals &LIS) {
   SlotIndex SlotIdx = LIS.getInstructionIndex(MI);
   for (auto *RI = Defs.begin(); RI != Defs.end(); /*empty*/) {
-    Register Reg = RI->RegUnit;
-    const LiveRange *LR = getLiveRange(LIS, Reg);
+    const LiveRange *LR = getLiveRange(LIS, RI->VRegOrUnit);
     if (LR != nullptr) {
       LiveQueryResult LRQ = LR->Query(SlotIdx);
       if (LRQ.isDeadDef()) {
@@ -595,14 +600,14 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
                                           SlotIndex Pos,
                                           MachineInstr *AddFlagsMI) {
   for (auto *I = Defs.begin(); I != Defs.end();) {
-    LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
-                                           Pos.getDeadSlot());
+    LaneBitmask LiveAfter =
+        getLiveLanesAt(LIS, MRI, true, I->VRegOrUnit, Pos.getDeadSlot());
     // If the def is all that is live after the instruction, then in case
     // of a subregister def we need a read-undef flag.
-    Register RegUnit = I->RegUnit;
-    if (RegUnit.isVirtual() && AddFlagsMI != nullptr &&
+    VirtRegOrUnit VRegOrUnit = I->VRegOrUnit;
+    if (VRegOrUnit.isVirtualReg() && AddFlagsMI != nullptr &&
         (LiveAfter & ~I->LaneMask).none())
-      AddFlagsMI->setRegisterDefReadUndef(RegUnit);
+      AddFlagsMI->setRegisterDefReadUndef(VRegOrUnit.asVirtualReg());
 
     LaneBitmask ActualDef = I->LaneMask & LiveAfter;
     if (ActualDef.none()) {
@@ -614,18 +619,18 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
   }
 
   // For uses just copy the information from LIS.
-  for (auto &[RegUnit, LaneMask] : Uses)
-    LaneMask = getLiveLanesAt(LIS, MRI, true, RegUnit, Pos.getBaseIndex());
+  for (auto &[VRegOrUnit, LaneMask] : Uses)
+    LaneMask = getLiveLanesAt(LIS, MRI, true, VRegOrUnit, Pos.getBaseIndex());
 
   if (AddFlagsMI != nullptr) {
     for (const VRegMaskOrUnit &P : DeadDefs) {
-      Register RegUnit = P.RegUnit;
-      if (!RegUnit.isVirtual())
+      VirtRegOrUnit VRegOrUnit = P.VRegOrUnit;
+      if (!VRegOrUnit.isVirtualReg())
         continue;
-      LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, RegUnit,
-                                             Pos.getDeadSlot());
+      LaneBitmask LiveAfter =
+          getLiveLanesAt(LIS, MRI, true, VRegOrUnit, Pos.getDeadSlot());
       if (LiveAfter.none())
-        AddFlagsMI->setRegisterDefReadUndef(RegUnit);
+        AddFlagsMI->setRegisterDefReadUndef(VRegOrUnit.asVirtualReg());
     }
   }
 }
@@ -648,16 +653,16 @@ void PressureDiffs::addInstruction(unsigned Idx,
   PressureDiff &PDiff = (*this)[Idx];
   assert(!PDiff.begin()->isValid() && "stale PDiff");
   for (const VRegMaskOrUnit &P : RegOpers.Defs)
-    PDiff.addPressureChange(P.RegUnit, true, &MRI);
+    PDiff.addPressureChange(P.VRegOrUnit, true, &MRI);
 
   for (const VRegMaskOrUnit &P : RegOpers.Uses)
-    PDiff.addPressureChange(P.RegUnit, false, &MRI);
+    PDiff.addPressureChange(P.VRegOrUnit, false, &MRI);
 }
 
 /// Add a change in pressure to the pressure diff of a given instruction.
-void PressureDiff::addPressureChange(Register RegUnit, bool IsDec,
+void PressureDiff::addPressureChange(VirtRegOrUnit VRegOrUnit, bool IsDec,
                                      const MachineRegisterInfo *MRI) {
-  PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+  PSetIterator PSetI = MRI->getPressureSets(VRegOrUnit);
   int Weight = IsDec ? -PSetI.getWeight() : PSetI.getWeight();
   for (; PSetI.isValid(); ++PSetI) {
     // Find an existing entry in the pressure diff for this PSet.
@@ -694,7 +699,7 @@ void RegPressureTracker::addLiveRegs(ArrayRef<VRegMaskOrUnit> Regs) {
   for (const VRegMaskOrUnit &P : Regs) {
     LaneBitmask PrevMask = LiveRegs.insert(P);
     LaneBitmask NewMask = PrevMask | P.LaneMask;
-    increaseRegPressure(P.RegUnit, PrevMask, NewMask);
+    increaseRegPressure(P.VRegOrUnit, PrevMask, NewMask);
   }
 }
 
@@ -702,9 +707,9 @@ void RegPressureTracker::discoverLiveInOrOut(
     VRegMaskOrUnit Pair, SmallVectorImpl<VRegMaskOrUnit> &LiveInOrOut) {
   assert(Pair.LaneMask.any());
 
-  Register RegUnit = Pair.RegUnit;
-  auto I = llvm::find_if(LiveInOrOut, [RegUnit](const VRegMaskOrUnit &Other) {
-    return Other.RegUnit == RegUnit;
+  VirtRegOrUnit VRegOrUnit = Pair.VRegOrUnit;
+  auto I = find_if(LiveInOrOut, [VRegOrUnit](const VRegMaskOrUnit &Other) {
+    return Other.VRegOrUnit == VRegOrUnit;
   });
   LaneBitmask PrevMask;
   LaneBitmask NewMask;
@@ -717,7 +722,7 @@ void RegPressureTracker::discoverLiveInOrOut(
     NewMask = PrevMask | Pair.LaneMask;
     I->LaneMask = NewMask;
   }
-  increaseSetPressure(P.MaxSetPressure, *MRI, RegUnit, PrevMask, NewMask);
+  increaseSetPressure(P.MaxSetPressure, *MRI, VRegOrUnit, PrevMask, NewMask);
 }
 
 void RegPressureTracker::discoverLiveIn(VRegMaskOrUnit Pair) {
@@ -730,16 +735,14 @@ void RegPressureTracker::discoverLiveOut(VRegMaskOrUnit Pair) {
 
 void RegPressureTracker::bumpDeadDefs(ArrayRef<VRegMaskOrUnit> DeadDefs) {
   for (const VRegMaskOrUnit &P : DeadDefs) {
-    Register Reg = P.RegUnit;
-    LaneBitmask LiveMask = LiveRegs.contains(Reg);
+    LaneBitmask LiveMask = LiveRegs.contains(P.VRegOrUnit);
     LaneBitmask BumpedMask = LiveMask | P.LaneMask;
-    increaseRegPressure(Reg, LiveMask, BumpedMask);
+    increaseRegPressure(P.VRegOrUnit, LiveMask, BumpedMask);
   }
   for (const VRegMaskOrUnit &P : DeadDefs) {
-    Register Reg = P.RegUnit;
-    LaneBitmask LiveMask = LiveRegs.contains(Reg);
+    LaneBitmask LiveMask = LiveRegs.contains(P.VRegOrUnit);
     LaneBitmask BumpedMask = LiveMask | P.LaneMask;
-    decreaseRegPressure(Reg, BumpedMask, LiveMask);
+    decreaseRegPressure(P.VRegOrUnit, BumpedMask, LiveMask);
   }
 }
 
@@ -758,17 +761,17 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
   // Kill liveness at live defs.
   // TODO: consider earlyclobbers?
   for (const VRegMaskOrUnit &Def : RegOpers.Defs) {
-    Register Reg = Def.RegUnit;
+    VirtRegOrUnit VRegOrUnit = Def.VRegOrUnit;
 
     LaneBitmask PreviousMask = LiveRegs.erase(Def);
     LaneBitmask NewMask = PreviousMask & ~Def.LaneMask;
 
     LaneBitmask LiveOut = Def.LaneMask & ~PreviousMask;
     if (LiveOut.any()) {
-      discoverLiveOut(VRegMaskOrUnit(Reg, LiveOut));
+      discoverLiveOut(VRegMaskOrUnit(VRegOrUnit, LiveOut));
       // Retroactively model effects on pressure of the live out lanes.
-      increaseSetPressure(CurrSetPressure, *MRI, Reg, LaneBitmask::getNone(),
-                          LiveOut);
+      increaseSetPressure(CurrSetPressure, *MRI, VRegOrUnit,
+                          LaneBitmask::getNone(), LiveOut);
       PreviousMask = LiveOut;
     }
 
@@ -776,10 +779,10 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
       // Add a 0 entry to LiveUses as a marker that the complete vreg has become
       // dead.
       if (TrackLaneMasks && LiveUses != nullptr)
-        setRegZero(*LiveUses, Reg);
+        setRegZero(*LiveUses, VRegOrUnit);
     }
 
-    decreaseRegPressure(Reg, PreviousMask, NewMask);
+    decreaseRegPressure(VRegOrUnit, PreviousMask, NewMask);
   }
 
   SlotIndex SlotIdx;
@@ -788,7 +791,7 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
 
   // Generate liveness for uses.
   for (const VRegMaskOrUnit &Use : RegOpers.Uses) {
-    Register Reg = Use.RegUnit;
+    VirtRegOrUnit VRegOrUnit = Use.VRegOrUnit;
     assert(Use.LaneMask.any());
     LaneBitmask PreviousMask = LiveRegs.insert(Use);
     LaneBitmask NewMask = PreviousMask | Use.LaneMask;
@@ -799,38 +802,38 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
     if (PreviousMask.none()) {
       if (LiveUses != nullptr) {
         if (!TrackLaneMasks) {
-          addRegLanes(*LiveUses, VRegMaskOrUnit(Reg, NewMask));
+          addRegLanes(*LiveUses, VRegMaskOrUnit(VRegOrUnit, NewMask));
         } else {
-          auto I = llvm::find_if(*LiveUses, [Reg](const VRegMaskOrUnit Other) {
-            return Other.RegUnit == Reg;
+          auto I = find_if(*LiveUses, [VRegOrUnit](const VRegMaskOrUnit Other) {
+            return Other.VRegOrUnit == VRegOrUnit;
           });
           bool IsRedef = I != LiveUses->end();
           if (IsRedef) {
             // ignore re-defs here...
             assert(I->LaneMask.none());
-            removeRegLanes(*LiveUses, VRegMaskOrUnit(Reg, NewMask));
+            removeRegLanes(*LiveUses, VRegMaskOrUnit(VRegOrUnit, NewMask));
           } else {
-            addRegLanes(*LiveUses, VRegMaskOrUnit(Reg, NewMask));
+            addRegLanes(*LiveUses, VRegMaskOrUnit(VRegOrUnit, NewMask));
           }
         }
       }
 
       // Discover live outs if this may be the first occurance of this register.
       if (RequireIntervals) {
-        LaneBitmask LiveOut = getLiveThroughAt(Reg, SlotIdx);
+        LaneBitmask LiveOut = getLiveThroughAt(VRegOrUnit, SlotIdx);
         if (LiveOut.any())
-          discoverLiveOut(VRegMaskOrUnit(Reg, LiveOut));
+          discoverLiveOut(VRegMaskOrUnit(VRegOrUnit, LiveOut));
       }
     }
 
-    increaseRegPressure(Reg, PreviousMask, NewMask);
+    increaseRegPressure(VRegOrUnit, PreviousMask, NewMask);
   }
   if (TrackUntiedDefs) {
     for (const VRegMaskOrUnit &Def : RegOpers.Defs) {
-      Register RegUnit = Def.RegUnit;
-      if (RegUnit.isVirtual() &&
-          (LiveRegs.contains(RegUnit) & Def.LaneMask).none())
-        UntiedDefs.insert(RegUnit);
+      VirtRegOrUnit VRegOrUnit = Def.VRegOrUnit;
+      if (VRegOrUnit.isVirtualReg() &&
+          (LiveRegs.contains(VRegOrUnit) & Def.LaneMask).none())
+        UntiedDefs.insert(VRegOrUnit.asVirtualReg());
     }
   }
 }
@@ -898,20 +901,20 @@ void RegPressureTracker::advance(const RegisterOperands &RegOpers) {
   }
 
   for (const VRegMaskOrUnit &Use : RegOpers.Uses) {
-    Register Reg = Use.RegUnit;
-    LaneBitmask LiveMask = LiveRegs.contains(Reg);
+    VirtRegOrUnit VRegOrUnit = Use.VRegOrUnit;
+    LaneBitmask LiveMask = LiveRegs.contains(VRegOrUnit);
     LaneBitmask LiveIn = Use.LaneMask & ~LiveMask;
     if (LiveIn.any()) {
-      discoverLiveIn(VRegMaskOrUnit(Reg, LiveIn));
-      increaseRegPressure(Reg, LiveMask, LiveMask | LiveIn);
-      LiveRegs.insert(VRegMaskOrUnit(Reg, LiveIn));
+      discoverLiveIn(VRegMaskOrUnit(VRegOrUnit, LiveIn));
+      increaseRegPressure(VRegOrUnit, LiveMask, LiveMask | LiveIn);
+      LiveRegs.insert(VRegMaskOrUnit(VRegOrUnit, LiveIn));
     }
     // Kill liveness at last uses.
     if (RequireIntervals) {
-      LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+      LaneBitmask LastUseMask = getLastUsedLanes(VRegOrUnit, SlotIdx);
       if (LastUseMask.any()) {
-        LiveRegs.erase(VRegMaskOrUnit(Reg, LastUseMask));
-        decreaseRegPressure(Reg, LiveMask, LiveMask & ~LastUseMask);
+        LiveRegs.erase(VRegMaskOrUnit(VRegOrUnit, LastUseMask));
+        decreaseRegPressure(VRegOrUnit, LiveMask, LiveMask & ~LastUseMask);
       }
     }
   }
@@ -920,7 +923,7 @@ void RegPressureTracker::advance(const RegisterOperands &RegOpers) {
   for (const VRegMaskOrUnit &Def : RegOpers.Defs) {
     LaneBitmask PreviousMask = LiveRegs.insert(Def);
     LaneBitmask NewMask = PreviousMask | Def.LaneMask;
-    increaseRegPressure(Def.RegUnit, PreviousMask, NewMask);
+    increaseRegPressure(Def.VRegOrUnit, PreviousMask, NewMask);
   }
 
   // Boost pressure for all dead defs together.
@@ -1047,22 +1050,20 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
 
   // Kill liveness at live defs.
   for (const VRegMaskOrUnit &P : RegOpers.Defs) {
-    Register Reg = P.RegUnit;
-    LaneBitmask LiveAfter = LiveRegs.contains(Reg);
-    LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
+    LaneBitmask LiveAfter = LiveRegs.contains(P.VRegOrUnit);
+    LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, P.VRegOrUnit);
     LaneBitmask DefLanes = P.LaneMask;
     LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes;
 
     // There may be parts of the register that were dead before the
     // instruction, but became live afterwards.
-    decreaseRegPressure(Reg, LiveAfter, LiveAfter & LiveBefore);
+    decreaseRegPressure(P.VRegOrUnit, LiveAfter, LiveAfter & LiveBefore);
   }
   // Generate liveness for uses. Also handle any uses which overlap with defs.
   for (const VRegMaskOrUnit &P : RegOpers.Uses) {
-    Register Reg = P.RegUnit;
-    LaneBitmask LiveAfter = LiveRegs.contains(Reg);
+    LaneBitmask LiveAfter = LiveRegs.contains(P.VRegOrUnit);
     LaneBitmask LiveBefore = LiveAfter | P.LaneMask;
-    increaseRegPressure(Reg, LiveAfter, LiveBefore);
+    increaseRegPressure(P.VRegOrUnit, LiveAfter, LiveBefore);
   }
 }
 
@@ -1209,11 +1210,17 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
 /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
 /// The query starts with a lane bitmask which gets lanes/bits removed for every
 /// use we find.
-static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
+static LaneBitmask findUseBetween(VirtRegOrUnit VRegOrUnit,
+                                  LaneBitmask LastUseMask,
                                   SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
                                   const MachineRegisterInfo &MRI,
                                   const LiveIntervals *LIS) {
   const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+  // FIXME: The static_cast is a bug.
+  Register Reg =
+      VRegOrUnit.isVirtualReg()
+          ? VRegOrUnit.asVirtualReg()
+          : Register(static_cast<unsigned>(VRegOrUnit.asMCRegUnit()));
   for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
     if (MO.isUndef())
       continue;
@@ -1230,32 +1237,30 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
   return LastUseMask;
 }
 
-LaneBitmask RegPressureTracker::getLiveLanesAt(Register RegUnit,
+LaneBitmask RegPressureTracker::getLiveLanesAt(VirtRegOrUnit VRegOrUnit,
                                                SlotIndex Pos) const {
   assert(RequireIntervals);
-  return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit, Pos,
-                              LaneBitmask::getAll(),
-      [](const LiveRange &LR, SlotIndex Pos) {
-        return LR.liveAt(Pos);
-      });
+  return getLanesWithProperty(
+      *LIS, *MRI, TrackLaneMasks, VRegOrUnit, Pos, LaneBitmask::getAll(),
+      [](const LiveRange &LR, SlotIndex Pos) { return LR.liveAt(Pos); });
 }
 
-LaneBitmask RegPressureTracker::getLastUsedLanes(Register RegUnit,
+LaneBitmask RegPressureTracker::getLastUsedLanes(VirtRegOrUnit VRegOrUnit,
                                                  SlotIndex Pos) const {
   assert(RequireIntervals);
-  return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit,
-                              Pos.getBaseIndex(), LaneBitmask::getNone(),
-      [](const LiveRange &LR, SlotIndex Pos) {
+  return getLanesWithProperty(
+      *LIS, *MRI, TrackLaneMasks, VRegOrUnit, Pos.getBaseIndex(),
+      LaneBitmask::getNone(), [](const LiveRange &LR, SlotIndex Pos) {
         const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
         return S != nullptr && S->end == Pos.getRegSlot();
       });
 }
 
-LaneBitmask RegPressureTracker::getLiveThroughAt(Register RegUnit,
+LaneBitmask RegPressureTracker::getLiveThroughAt(VirtRegOrUnit VRegOrUnit,
                                                  SlotIndex Pos) const {
   assert(RequireIntervals);
-  return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit, Pos,
-                              LaneBitmask::getNone(),
+  return getLanesWithProperty(
+      *LIS, *MRI, TrackLaneMasks, VRegOrUnit, Pos, LaneBitmask::getNone(),
       [](const LiveRange &LR, SlotIndex Pos) {
         const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
         return S != nullptr && S->start < Pos.getRegSlot(true) &&
@@ -1284,8 +1289,8 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
 
   if (RequireIntervals) {
     for (const VRegMaskOrUnit &Use : RegOpers.Uses) {
-      Register Reg = Use.RegUnit;
-      LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+      VirtRegOrUnit VRegOrUnit = Use.VRegOrUnit;
+      LaneBitmask LastUseMask = getLastUsedLanes(VRegOrUnit, SlotIdx);
       if (LastUseMask.none())
         continue;
       // The LastUseMask is queried from the liveness information of instruction
@@ -1294,23 +1299,22 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
       // FIXME: allow the caller to pass in the list of vreg uses that remain
       // to be bottom-scheduled to avoid searching uses at each query.
       SlotIndex CurrIdx = getCurrSlot();
-      LastUseMask
-        = findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, LIS);
+      LastUseMask =
+          findUseBetween(VRegOrUnit, LastUseMask, CurrIdx, SlotIdx, *MRI, LIS);
       if (LastUseMask.none())
         continue;
 
-      LaneBitmask LiveMask = LiveRegs.contains(Reg);
+      LaneBitmask LiveMask = LiveRegs.contains(VRegOrUnit);
       LaneBitmask NewMask = LiveMask & ~LastUseMask;
-      decreaseRegPressure(Reg, LiveMask, NewMask);
+      decreaseRegPressure(VRegOrUnit, LiveMask, NewMask);
     }
   }
 
   // Generate liveness for defs.
   for (const VRegMaskOrUnit &Def : RegOpers.Defs) {
-    Register Reg = Def.RegUnit;
-    LaneBitmask LiveMask = LiveRegs.contains(Reg);
+    LaneBitmask LiveMask = LiveRegs.contains(Def.VRegOrUnit);
     LaneBitmask NewMask = LiveMask | Def.LaneMask;
-    increaseRegPressure(Reg, LiveMask, NewMask);
+    increaseRegPressure(Def.VRegOrUnit, LiveMask, NewMask);
   }
 
   // Boost pressure for all dead defs together.
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index a5c81afc57a80..975895809b9de 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -156,12 +156,13 @@ Printable llvm::printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI) {
   });
 }
 
-Printable llvm::printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
-  return Printable([Unit, TRI](raw_ostream &OS) {
-    if (Register::isVirtualRegister(Unit)) {
-      OS << '%' << Register(Unit).virtRegIndex();
+Printable llvm::printVRegOrUnit(VirtRegOrUnit VRegOrUnit,
+                                const TargetRegisterInfo *TRI) {
+  return Printable([VRegOrUnit, TRI](raw_ostream &OS) {
+    if (VRegOrUnit.isVirtualReg()) {
+      OS << '%' << VRegOrUnit.asVirtualReg().virtRegIndex();
     } else {
-      OS << printRegUnit(Unit, TRI);
+      OS << printRegUnit(VRegOrUnit.asMCRegUnit(), TRI);
     }
   });
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 4e11c4ff3d56e..0c5e3d0837800 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -282,11 +282,12 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits,
 
     Register Reg = MO.getReg();
     auto I = llvm::find_if(VRegMaskOrUnits, [Reg](const VRegMaskOrUnit &RM) {
-      return RM.RegUnit == Reg;
+      return RM.VRegOrUnit.asVirtualReg() == Reg;
     });
 
     auto &P = I == VRegMaskOrUnits.end()
-                  ? VRegMaskOrUnits.emplace_back(Reg, LaneBitmask::getNone())
+                  ? VRegMaskOrUnits.emplace_back(VirtRegOrUnit(Reg),
+                                                 LaneBitmask::getNone())
                   : *I;
 
     P.LaneMask |= MO.getSubReg() ? TRI.getSubRegIndexLaneMask(MO.getSubReg())
@@ -295,7 +296,7 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits,
 
   SlotIndex InstrSI;
   for (auto &P : VRegMaskOrUnits) {
-    auto &LI = LIS.getInterval(P.RegUnit);
+    auto &LI = LIS.getInterval(P.VRegOrUnit.asVirtualReg());
     if (!LI.hasSubRanges())
       continue;
 
@@ -562,10 +563,10 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   SmallVector<VRegMaskOrUnit, 8> RegUses;
   collectVirtualRegUses(RegUses, MI, LIS, *MRI);
   for (const VRegMaskOrUnit &U : RegUses) {
-    LaneBitmask &LiveMask = LiveRegs[U.RegUnit];
+    LaneBitmask &LiveMask = LiveRegs[U.VRegOrUnit.asVirtualReg()];
     LaneBitmask PrevMask = LiveMask;
     LiveMask |= U.LaneMask;
-    CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI);
+    CurPressure.inc(U.VRegOrUnit.asVirtualReg(), PrevMask, LiveMask, *MRI);
   }
 
   // Update MaxPressure with uses plus early-clobber defs pressure.
@@ -748,9 +749,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
   GCNRegPressure TempPressure = CurPressure;
 
   for (const VRegMaskOrUnit &Use : RegOpers.Uses) {
-    Register Reg = Use.RegUnit;
-    if (!Reg.isVirtual())
+    if (!Use.VRegOrUnit.isVirtualReg())
       continue;
+    Register Reg = Use.VRegOrUnit.asVirtualReg();
     LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
     if (LastUseMask.none())
       continue;
@@ -782,9 +783,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI,
 
   // Generate liveness for defs.
   for (const VRegMaskOrUnit &Def : RegOpers.Defs) {
-    Register Reg = Def.RegUnit;
-    if (!Reg.isVirtual())
+    if (!Def.VRegOrUnit.isVirtualReg())
       continue;
+    Register Reg = Def.VRegOrUnit.asVirtualReg();
     auto It = LiveRegs.find(Reg);
     LaneBitmask LiveMask = It != LiveRegs.end() ? It->second : LaneBitmask(0);
     LaneBitmask NewMask = LiveMask | Def.LaneMask;
@@ -824,8 +825,7 @@ Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs,
       Register Reg = Register::index2VirtReg(I);
       auto It = LiveRegs.find(Reg);
       if (It != LiveRegs.end() && It->second.any())
-        OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
-           << PrintLaneMask(It->second);
+        OS << ' ' << printReg(Reg, TRI) << ':' << PrintLaneMask(It->second);
     }
     OS << '\n';
   });
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index fd28abeb887c2..2f3ad39c75fcc 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -323,8 +323,8 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
 
   // Do not Track Physical Registers, because it messes up.
   for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
-    if (RegMaskPair.RegUnit.isVirtual())
-      LiveInRegs.insert(RegMaskPair.RegUnit);
+    if (RegMaskPair.VRegOrUnit.isVirtualReg())
+      LiveInRegs.insert(RegMaskPair.VRegOrUnit.asVirtualReg());
   }
   LiveOutRegs.clear();
   // There is several possibilities to distinguish:
@@ -350,12 +350,13 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
   // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
   // The use of findDefBetween removes the case 4.
   for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
-    Register Reg = RegMaskPair.RegUnit;
-    if (Reg.isVirtual() &&
-        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
+    VirtRegOrUnit VRegOrUnit = RegMaskPair.VRegOrUnit;
+    if (VRegOrUnit.isVirtualReg() &&
+        isDefBetween(VRegOrUnit.asVirtualReg(),
+                     LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                      LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                      LIS)) {
-      LiveOutRegs.insert(Reg);
+      LiveOutRegs.insert(VRegOrUnit.asVirtualReg());
     }
   }
 
@@ -578,11 +579,11 @@ void SIScheduleBlock::printDebug(bool full) {
            << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
     dbgs() << "LiveIns:\n";
     for (Register Reg : LiveInRegs)
-      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
 
     dbgs() << "\nLiveOuts:\n";
     for (Register Reg : LiveOutRegs)
-      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
   }
 
   dbgs() << "\nInstructions:\n";
@@ -1446,23 +1447,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
   }
 #endif
 
-  std::set<Register> InRegs = DAG->getInRegs();
+  std::set<VirtRegOrUnit> InRegs = DAG->getInRegs();
   addLiveRegs(InRegs);
 
   // Increase LiveOutRegsNumUsages for blocks
   // producing registers consumed in another
   // scheduling region.
-  for (Register Reg : DAG->getOutRegs()) {
+  for (VirtRegOrUnit VRegOrUnit : DAG->getOutRegs()) {
     for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
       // Do reverse traversal
       int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
       SIScheduleBlock *Block = Blocks[ID];
       const std::set<Register> &OutRegs = Block->getOutRegs();
 
-      if (OutRegs.find(Reg) == OutRegs.end())
+      if (!VRegOrUnit.isVirtualReg() ||
+          OutRegs.find(VRegOrUnit.asVirtualReg()) == OutRegs.end())
         continue;
 
-      ++LiveOutRegsNumUsages[ID][Reg];
+      ++LiveOutRegsNumUsages[ID][VRegOrUnit.asVirtualReg()];
       break;
     }
   }
@@ -1565,15 +1567,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
     maxVregUsage = VregCurrentUsage;
   if (SregCurrentUsage > maxSregUsage)
     maxSregUsage = SregCurrentUsage;
-  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
-             for (SIScheduleBlock *Block : ReadyBlocks)
-               dbgs() << Block->getID() << ' ';
-             dbgs() << "\nCurrent Live:\n";
-             for (Register Reg : LiveRegs)
-               dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
-             dbgs() << '\n';
-             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
-             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
+  LLVM_DEBUG({
+    dbgs() << "Picking New Blocks\n";
+    dbgs() << "Available: ";
+    for (SIScheduleBlock *Block : ReadyBlocks)
+      dbgs() << Block->getID() << ' ';
+    dbgs() << "\nCurrent Live:\n";
+    for (Register Reg : LiveRegs)
+      dbgs() << printReg(Reg, DAG->getTRI()) << ' ';
+    dbgs() << '\n';
+    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+  });
 
   Cand.Block = nullptr;
   for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
@@ -1625,13 +1630,13 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
 
 // Tracking of currently alive registers to determine VGPR Usage.
 
-void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) {
-  for (Register Reg : Regs) {
+void SIScheduleBlockScheduler::addLiveRegs(std::set<VirtRegOrUnit> &Regs) {
+  for (VirtRegOrUnit VRegOrUnit : Regs) {
     // For now only track virtual registers.
-    if (!Reg.isVirtual())
+    if (!VRegOrUnit.isVirtualReg())
       continue;
     // If not already in the live set, then add it.
-    (void) LiveRegs.insert(Reg);
+    (void)LiveRegs.insert(VRegOrUnit.asVirtualReg());
   }
 }
 
@@ -1662,7 +1667,7 @@ void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
 
 void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
   decreaseLiveRegs(Block, Block->getInRegs());
-  addLiveRegs(Block->getOutRegs());
+  LiveRegs.insert(Block->getOutRegs().begin(), Block->getOutRegs().end());
   releaseBlockSuccs(Block);
   for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
     // We produce this register, thus it must not be previously alive.
@@ -1689,7 +1694,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
       continue;
     if (LiveRegsConsumers[Reg] > 1)
       continue;
-    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+    PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
     for (; PSetI.isValid(); ++PSetI) {
       DiffSetPressure[*PSetI] -= PSetI.getWeight();
     }
@@ -1699,7 +1704,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs,
     // For now only track virtual registers.
     if (!Reg.isVirtual())
       continue;
-    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+    PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg));
     for (; PSetI.isValid(); ++PSetI) {
       DiffSetPressure[*PSetI] += PSetI.getWeight();
     }
@@ -1846,7 +1851,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
     // For now only track virtual registers
     if (!Reg.isVirtual())
       continue;
-    PSetIterator PSetI = MRI.getPressureSets(Reg);
+    PSetIterator PSetI = MRI.getPressureSets(VirtRegOrUnit(Reg));
     for (; PSetI.isValid(); ++PSetI) {
       if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
         VgprUsage += PSetI.getWeight();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index b219cbd5672f0..1245774400af1 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -389,7 +389,7 @@ class SIScheduleBlockScheduler {
                             SIBlockSchedCandidate &TryCand);
   SIScheduleBlock *pickBlock();
 
-  void addLiveRegs(std::set<Register> &Regs);
+  void addLiveRegs(std::set<VirtRegOrUnit> &Regs);
   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<Register> &Regs);
   void releaseBlockSuccs(SIScheduleBlock *Parent);
   void blockScheduled(SIScheduleBlock *Block);
@@ -462,18 +462,18 @@ class SIScheduleDAGMI final : public ScheduleDAGMILive {
                                                      unsigned &VgprUsage,
                                                      unsigned &SgprUsage);
 
-  std::set<Register> getInRegs() {
-    std::set<Register> InRegs;
+  std::set<VirtRegOrUnit> getInRegs() {
+    std::set<VirtRegOrUnit> InRegs;
     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
-      InRegs.insert(RegMaskPair.RegUnit);
+      InRegs.insert(RegMaskPair.VRegOrUnit);
     }
     return InRegs;
   }
 
-  std::set<unsigned> getOutRegs() {
-    std::set<unsigned> OutRegs;
+  std::set<VirtRegOrUnit> getOutRegs() {
+    std::set<VirtRegOrUnit> OutRegs;
     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
-      OutRegs.insert(RegMaskPair.RegUnit);
+      OutRegs.insert(RegMaskPair.VRegOrUnit);
     }
     return OutRegs;
   };
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 6611e1e6507e1..10762edc16264 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -188,8 +188,9 @@ class SIWholeQuadMode {
 
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
-  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
-                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
+  void markDefs(const MachineInstr &UseMI, LiveRange &LR,
+                VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag,
+                std::vector<WorkItem> &Worklist);
   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                    std::vector<WorkItem> &Worklist);
   void markInstructionUses(const MachineInstr &MI, char Flag,
@@ -318,8 +319,8 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
 
 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
-                               Register Reg, unsigned SubReg, char Flag,
-                               std::vector<WorkItem> &Worklist) {
+                               VirtRegOrUnit VRegOrUnit, unsigned SubReg,
+                               char Flag, std::vector<WorkItem> &Worklist) {
   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
 
   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
@@ -331,8 +332,9 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
   // cover registers.
   const LaneBitmask UseLanes =
       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
-             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
-                                : LaneBitmask::getNone());
+             : (VRegOrUnit.isVirtualReg()
+                    ? MRI->getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg())
+                    : LaneBitmask::getNone());
 
   // Perform a depth-first iteration of the LiveRange graph marking defs.
   // Stop processing of a given branch when all use lanes have been defined.
@@ -382,11 +384,11 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
       assert(MI && "Def has no defining instruction");
 
-      if (Reg.isVirtual()) {
+      if (VRegOrUnit.isVirtualReg()) {
         // Iterate over all operands to find relevant definitions
         bool HasDef = false;
         for (const MachineOperand &Op : MI->all_defs()) {
-          if (Op.getReg() != Reg)
+          if (Op.getReg() != VRegOrUnit.asVirtualReg())
             continue;
 
           // Compute lanes defined and overlap with use
@@ -453,7 +455,7 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                     << " for " << MI);
   if (Reg.isVirtual()) {
     LiveRange &LR = LIS->getInterval(Reg);
-    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
+    markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist);
   } else {
     // Handle physical registers that we need to track; this is mostly relevant
     // for VCC, which can appear as the (implicit) input of a uniform branch,
@@ -462,7 +464,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
       LiveRange &LR = LIS->getRegUnit(Unit);
       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
       if (Value)
-        markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
+        markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag,
+                 Worklist);
     }
   }
 }
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 6077c18463240..02887ce93c525 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6551,7 +6551,7 @@ class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
   static int constexpr LAST_IS_USE = MAX_STAGES;
   static int constexpr SEEN_AS_LIVE = MAX_STAGES + 1;
   typedef std::bitset<MAX_STAGES + 2> IterNeed;
-  typedef std::map<unsigned, IterNeed> IterNeeds;
+  typedef std::map<Register, IterNeed> IterNeeds;
 
   void bumpCrossIterationPressure(RegPressureTracker &RPT,
                                   const IterNeeds &CIN);
@@ -6625,14 +6625,14 @@ void ARMPipelinerLoopInfo::bumpCrossIterationPressure(RegPressureTracker &RPT,
   for (const auto &N : CIN) {
     int Cnt = N.second.count() - N.second[SEEN_AS_LIVE] * 2;
     for (int I = 0; I < Cnt; ++I)
-      RPT.increaseRegPressure(Register(N.first), LaneBitmask::getNone(),
+      RPT.increaseRegPressure(VirtRegOrUnit(N.first), LaneBitmask::getNone(),
                               LaneBitmask::getAll());
   }
   // Decrease pressure by the amounts in CrossIterationNeeds
   for (const auto &N : CIN) {
     int Cnt = N.second.count() - N.second[SEEN_AS_LIVE] * 2;
     for (int I = 0; I < Cnt; ++I)
-      RPT.decreaseRegPressure(Register(N.first), LaneBitmask::getAll(),
+      RPT.decreaseRegPressure(VirtRegOrUnit(N.first), LaneBitmask::getAll(),
                               LaneBitmask::getNone());
   }
 }

From 4340159400497dfdb15a2fc7b285482d52df5889 Mon Sep 17 00:00:00 2001
From: Asher Dobrescu <asher.dobrescu@gmail.com>
Date: Thu, 13 Nov 2025 10:38:14 +0000
Subject: [PATCH 23/29] [AArch64] Fix SVE FADDP latency on Neoverse-N3
 (#167676)

This patch fixes the latency of the SVE FADDP instruction in the
Neoverse-N3 scheduling model. The latency of floating point arith, min/max
pairwise SVE FADDP should be 3, as per the N3 SWOG.
---
 llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td           | 4 ++--
 .../tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
index 411b372a3f533..6b788772ac889 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -1926,7 +1926,6 @@ def : InstRW<[N3Write_2c_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]",
 // Floating point arithmetic
 def : InstRW<[N3Write_2c_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]",
                                          "^F(ADD|SUB)_ZPZ[IZ]_[HSD]",
-                                         "^FADDP_ZPmZZ_[HSD]",
                                          "^FNEG_ZPmZ_[HSD]",
                                          "^FSUBR_ZPm[IZ]_[HSD]",
                                          "^FSUBR_(ZPZI|ZPZZ)_[HSD]")>;
@@ -2001,7 +2000,8 @@ def : InstRW<[N3Write_10c_4V0], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_S")>;
 def : InstRW<[N3Write_13c_2V0], (instregex "^FDIVR?_(ZPmZ|ZPZZ)_D")>;
 
 // Floating point arith, min/max pairwise
-def : InstRW<[N3Write_3c_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
+def : InstRW<[N3Write_3c_1V], (instregex "^FADDP_ZPmZZ_[HSD]",
+                                         "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]")>;
 
 // Floating point min/max
 def : InstRW<[N3Write_2c_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]",
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s
index 395aa1141abb5..aa565f9aded26 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s
@@ -3993,9 +3993,9 @@ zip2	z31.s, z31.s, z31.s
 # CHECK-NEXT:  2      4     1.00                        fadda	d0, p7, d0, z31.d
 # CHECK-NEXT:  8      16    4.00                        fadda	h0, p7, h0, z31.h
 # CHECK-NEXT:  4      8     2.00                        fadda	s0, p7, s0, z31.s
-# CHECK-NEXT:  1      2     0.50                        faddp	z0.h, p0/m, z0.h, z1.h
-# CHECK-NEXT:  1      2     0.50                        faddp	z29.s, p3/m, z29.s, z30.s
-# CHECK-NEXT:  1      2     0.50                        faddp	z31.d, p7/m, z31.d, z30.d
+# CHECK-NEXT:  1      3     0.50                        faddp	z0.h, p0/m, z0.h, z1.h
+# CHECK-NEXT:  1      3     0.50                        faddp	z29.s, p3/m, z29.s, z30.s
+# CHECK-NEXT:  1      3     0.50                        faddp	z31.d, p7/m, z31.d, z30.d
 # CHECK-NEXT:  1      2     0.50                        faddv	d0, p7, z31.d
 # CHECK-NEXT:  3      6     1.50                        faddv	h0, p7, z31.h
 # CHECK-NEXT:  2      4     1.00                        faddv	s0, p7, z31.s

From a5342d5fe56ccf08f843a71b931dedcf09c2b4ca Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 13 Nov 2025 10:46:38 +0000
Subject: [PATCH 24/29] Revert "[DAG] Fold (umin (sub a b) a) -> (usubo a b);
 (select usubo.1 a usubo.0)" (#167854)

Reverts llvm/llvm-project#161651 due to downstream bad codegen reports
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  19 ---
 .../umin-sub-to-usubo-select-combine.ll       | 151 -----------------
 .../X86/umin-sub-to-usubo-select-combine.ll   | 156 ------------------
 3 files changed, 326 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
 delete mode 100644 llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d9d3a3ec01757..df353c4d91b1a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6219,25 +6219,6 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
                                         SDLoc(N), VT, N0, N1))
     return SD;
 
-  if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT) &&
-      !TLI.isOperationLegalOrCustom(ISD::UMIN, VT)) {
-    SDValue B;
-
-    // (umin (sub a, b), a) -> (usubo a, b); (select usubo.1, a, usubo.0)
-    if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B)))) {
-      SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT));
-      SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B);
-      return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0));
-    }
-
-    // (umin a, (sub a, b)) -> (usubo a, b); (select usubo.1, a, usubo.0)
-    if (sd_match(N1, m_Sub(m_Specific(N0), m_Value(B)))) {
-      SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT));
-      SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N0, B);
-      return DAG.getSelect(DL, VT, USO.getValue(1), N0, USO.getValue(0));
-    }
-  }
-
   // Simplify the operands using demanded-bits information.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
deleted file mode 100644
index fe3eee06db65e..0000000000000
--- a/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
+++ /dev/null
@@ -1,151 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
-
-; GitHub issue #161036
-
-; Positive test : umin(sub(a,b),a) with scalar types should be folded
-define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
-; CHECK-LABEL: underflow_compare_fold_i64
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT:  subs x8, x0, x1
-; CHECK-NEXT:  csel x0, x0, x8, lo
-; CHECK-NEXT:  ret
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
-  ret i64 %cond
-}
-
-; Positive test : umin(a,sub(a,b)) with scalar types should be folded
-define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
-; CHECK-LABEL: underflow_compare_fold_i64_commute
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT:  subs x8, x0, x1
-; CHECK-NEXT:  csel x0, x0, x8, lo
-; CHECK-NEXT:  ret
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
-  ret i64 %cond
-}
-
-; Positive test : multi-use is OK since the sub instruction still runs once
-define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
-; CHECK-LABEL: underflow_compare_fold_i64_multi_use
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT:  subs x8, x0, x1
-; CHECK-NEXT:  csel x0, x0, x8, lo
-; CHECK-NEXT:  str	x8, [x2]
-; CHECK-NEXT:  ret
-  %sub = sub i64 %a, %b
-  store i64 %sub, ptr addrspace(1) %ptr
-  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
-  ret i64 %cond
-}
-
-; Positive test : i32
-define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
-; CHECK-LABEL: underflow_compare_fold_i32
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT:  subs w8, w0, w1
-; CHECK-NEXT:  csel w0, w0, w8, lo
-; CHECK-NEXT:  ret
-  %sub = sub i32 %a, %b
-  %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
-  ret i32 %cond
-}
-
-; Positive test : i32
-define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
-; CHECK-LABEL: underflow_compare_fold_i32_commute
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT:  subs w8, w0, w1
-; CHECK-NEXT:  csel w0, w0, w8, lo
-; CHECK-NEXT:  ret
-  %sub = sub i32 %a, %b
-  %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
-  ret i32 %cond
-}
-
-; Positive test : i32
-define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
-; CHECK-LABEL: underflow_compare_fold_i32_multi_use
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT:  subs w8, w0, w1
-; CHECK-NEXT:  csel w0, w0, w8, lo
-; CHECK-NEXT:  str	w8, [x2]
-; CHECK-NEXT:  ret
-  %sub = sub i32 %a, %b
-  store i32 %sub, ptr addrspace(1) %ptr
-  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
-  ret i32 %cond
-}
-
-; Negative test : i16
-define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
-; CHECK-LABEL: underflow_compare_fold_i16
-; CHECK-LABEL: %bb.0:
-; CHECK-LABEL: sub w8, w0, w1
-; CHECK-LABEL: and w9, w0, #0xffff
-; CHECK-LABEL: and w8, w8, #0xffff
-; CHECK-LABEL: cmp w8, w9
-; CHECK-LABEL: csel w0, w8, w9, lo
-; CHECK-LABEL: ret
-  %sub = sub i16 %a, %b
-  %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
-  ret i16 %cond
-}
-
-; Negative test : i16
-define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
-; CHECK-LABEL: underflow_compare_fold_i16_commute
-; CHECK-LABEL: %bb.0:
-; CHECK-LABEL: sub w8, w0, w1
-; CHECK-LABEL: and w9, w0, #0xffff
-; CHECK-LABEL: and w8, w8, #0xffff
-; CHECK-LABEL: cmp w9, w8
-; CHECK-LABEL: csel w0, w9, w8, lo
-; CHECK-LABEL: ret
-  %sub = sub i16 %a, %b
-  %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
-  ret i16 %cond
-}
-
-; Negative test : i16
-define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
-; CHECK-LABEL: underflow_compare_fold_i16_multi_use
-; CHECK-LABEL: %bb.0:
-; CHECK-LABEL: sub w8, w0, w1
-; CHECK-LABEL: and w9, w0, #0xffff
-; CHECK-LABEL: and w10, w8, #0xffff
-; CHECK-LABEL: strh w8, [x2]
-; CHECK-LABEL: cmp w10, w9
-; CHECK-LABEL: csel w0, w10, w9, lo
-; CHECK-LABEL: ret
-  %sub = sub i16 %a, %b
-  store i16 %sub, ptr addrspace(1) %ptr
-  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
-  ret i16 %cond
-}
-
-; Negative test, vector types : umin(sub(a,b),a) but with vectors
-define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: underflow_compare_dontfold_vectors
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ret
-  %sub = sub <16 x i8> %a, %b
-  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
-  ret <16 x i8> %cond
-}
-
-; Negative test, pattern mismatch : umin(add(a,b),a)
-define i64 @umin_add(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_add
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: add x8, x0, x1
-; CHECK-NEXT: cmp x8, x0
-; CHECK-NEXT: csel x0, x8, x0, lo
-; CHECK-NEXT: ret
-  %add = add i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
-  ret i64 %cond
-}
diff --git a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
deleted file mode 100644
index e9756b411eb2c..0000000000000
--- a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
+++ /dev/null
@@ -1,156 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
-
-; GitHub issue #161036
-
-; Positive test : umin(sub(a,b),a) with scalar types should be folded
-define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
-; CHECK-LABEL: underflow_compare_fold_i64
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq    %rdi, %rax
-; CHECK-NEXT: subq    %rsi, %rax
-; CHECK-NEXT: cmovbq  %rdi, %rax
-; CHECK-NEXT: retq
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
-  ret i64 %cond
-}
-
-; Positive test : umin(a,sub(a,b)) with scalar types should be folded
-define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
-; CHECK-LABEL: underflow_compare_fold_i64_commute
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq    %rdi, %rax
-; CHECK-NEXT: subq    %rsi, %rax
-; CHECK-NEXT: cmovbq  %rdi, %rax
-; CHECK-NEXT: retq
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
-  ret i64 %cond
-}
-
-; Positive test : multi-use is OK since the sub instruction still runs once
-define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
-; CHECK-LABEL: underflow_compare_fold_i64_multi_use
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq    %rdi, %rax
-; CHECK-NEXT: subq    %rsi, %rax
-; CHECK-NEXT: movq    %rax, (%rdx)
-; CHECK-NEXT: cmovbq  %rdi, %rax
-; CHECK-NEXT: retq
-  %sub = sub i64 %a, %b
-  store i64 %sub, ptr addrspace(1) %ptr
-  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
-  ret i64 %cond
-}
-
-; Positive test : i32
-define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
-; CHECK-LABEL: underflow_compare_fold_i32
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movl    %edi, %eax
-; CHECK-NEXT: subl    %esi, %eax
-; CHECK-NEXT: cmovbl  %edi, %eax
-; CHECK-NEXT: retq
-  %sub = sub i32 %a, %b
-  %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
-  ret i32 %cond
-}
-
-; Positive test : i32
-define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
-; CHECK-LABEL: underflow_compare_fold_i32_commute
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movl    %edi, %eax
-; CHECK-NEXT: subl    %esi, %eax
-; CHECK-NEXT: cmovbl  %edi, %eax
-; CHECK-NEXT: retq
-  %sub = sub i32 %a, %b
-  %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
-  ret i32 %cond
-}
-
-; Positive test : i32
-define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
-; CHECK-LABEL: underflow_compare_fold_i32_multi_use
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movl    %edi, %eax
-; CHECK-NEXT: subl    %esi, %eax
-; CHECK-NEXT: movl    %eax, (%rdx)
-; CHECK-NEXT: cmovbl  %edi, %eax
-; CHECK-NEXT: retq
-  %sub = sub i32 %a, %b
-  store i32 %sub, ptr addrspace(1) %ptr
-  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
-  ret i32 %cond
-}
-
-; Positive test : i16
-define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
-; CHECK-LABEL: underflow_compare_fold_i16
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT: movl    %edi, %eax
-; CHECK-NEXT: subw    %si,  %ax
-; CHECK-NEXT: cmovbl  %edi, %eax
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT: retq
-  %sub = sub i16 %a, %b
-  %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
-  ret i16 %cond
-}
-
-; Positive test : i16
-define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
-; CHECK-LABEL: underflow_compare_fold_i16_commute
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT: movl    %edi, %eax
-; CHECK-NEXT: subw    %si,  %ax
-; CHECK-NEXT: cmovbl  %edi, %eax
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT: retq
-  %sub = sub i16 %a, %b
-  %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
-  ret i16 %cond
-}
-
-; Positive test : i16
-define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
-; CHECK-LABEL: underflow_compare_fold_i16_multi_use
-; CHECK-LABEL: %bb.0:
-; CHECK-NEXT: movl    %edi, %eax
-; CHECK-NEXT: subw    %si,  %ax
-; CHECK-NEXT: movw    %ax,  (%rdx)
-; CHECK-NEXT: cmovbl  %edi, %eax
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT: retq
-  %sub = sub i16 %a, %b
-  store i16 %sub, ptr addrspace(1) %ptr
-  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
-  ret i16 %cond
-}
-
-
-; Negative test, vector types : umin(sub(a,b),a) but with vectors
-define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: underflow_compare_dontfold_vectors
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psubb %xmm1, %xmm2
-; CHECK-NEXT: pminub %xmm2, %xmm0
-; CHECK-NEXT: retq
-  %sub = sub <16 x i8> %a, %b
-  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
-  ret <16 x i8> %cond
-}
-
-; Negative test, pattern mismatch : umin(add(a,b),a)
-define i64 @umin_add(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_add
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: leaq (%rsi,%rdi), %rax
-; CHECK-NEXT: cmpq %rdi, %rax
-; CHECK-NEXT: cmovaeq %rdi, %rax
-; CHECK-NEXT: retq
-  %add = add i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
-  ret i64 %cond
-}

From 5fa3ccb04fd6f19996449509233bbd6d14d14137 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Thu, 13 Nov 2025 10:50:29 +0000
Subject: [PATCH 25/29] [AArch64] Use SVE fdot for partial.reduce.fadd for NEON
 types. (#167856)

We only seem to use the SVE fdot for fixed-length vector types when they
are larger than 128 bits, whereas we can also use it for 128-bit
vectors if SVE2p1/SME2 is available.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  3 +
 .../AArch64/sve2p1-fixed-length-fdot.ll       | 68 ++++++++++++++++---
 2 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eaa10ef031989..656200ba8191c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1921,6 +1921,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
       setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32,
                                 MVT::nxv8f16, Legal);
+      // We can use SVE2p1 fdot to emulate the fixed-length variant.
+      setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::v4f32,
+                                MVT::v8f16, Custom);
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
index 89216ce2cb72b..864c66caf5f6c 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
@@ -4,6 +4,43 @@
 
 target triple = "aarch64-linux-gnu"
 
+define void @fdot_v4f32(ptr %accptr, ptr %aptr, ptr %bptr) {
+; SVE2-LABEL: fdot_v4f32:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    ldr q0, [x1]
+; SVE2-NEXT:    ldr q1, [x2]
+; SVE2-NEXT:    fcvtl v2.4s, v0.4h
+; SVE2-NEXT:    fcvtl v3.4s, v1.4h
+; SVE2-NEXT:    fcvtl2 v0.4s, v0.8h
+; SVE2-NEXT:    fcvtl2 v1.4s, v1.8h
+; SVE2-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; SVE2-NEXT:    ldr q3, [x0]
+; SVE2-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; SVE2-NEXT:    fadd v1.4s, v3.4s, v2.4s
+; SVE2-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; SVE2-NEXT:    str q0, [x0]
+; SVE2-NEXT:    ret
+;
+; SVE2P1-LABEL: fdot_v4f32:
+; SVE2P1:       // %bb.0: // %entry
+; SVE2P1-NEXT:    ldr q0, [x0]
+; SVE2P1-NEXT:    ldr q1, [x1]
+; SVE2P1-NEXT:    ldr q2, [x2]
+; SVE2P1-NEXT:    fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT:    str q0, [x0]
+; SVE2P1-NEXT:    ret
+entry:
+  %acc = load <4 x float>, ptr %accptr
+  %a = load <8 x half>, ptr %aptr
+  %b = load <8 x half>, ptr %bptr
+  %a.wide = fpext <8 x half> %a to <8 x float>
+  %b.wide = fpext <8 x half> %b to <8 x float>
+  %mult = fmul <8 x float> %a.wide, %b.wide
+  %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult)
+  store <4 x float> %partial.reduce, ptr %accptr
+  ret void
+}
+
 define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) {
 ; SVE2-LABEL: fdot_wide_v8f32:
 ; SVE2:       // %bb.0: // %entry
@@ -177,17 +214,26 @@ entry:
 }
 
 define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) {
-; CHECK-LABEL: fixed_fdot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-NEXT:    fcvtl v4.4s, v2.4h
-; CHECK-NEXT:    fcvtl2 v1.4s, v1.8h
-; CHECK-NEXT:    fcvtl2 v2.4s, v2.8h
-; CHECK-NEXT:    fmul v3.4s, v3.4s, v4.4s
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; SVE2-LABEL: fixed_fdot_wide:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    fcvtl v3.4s, v1.4h
+; SVE2-NEXT:    fcvtl v4.4s, v2.4h
+; SVE2-NEXT:    fcvtl2 v1.4s, v1.8h
+; SVE2-NEXT:    fcvtl2 v2.4s, v2.8h
+; SVE2-NEXT:    fmul v3.4s, v3.4s, v4.4s
+; SVE2-NEXT:    fmul v1.4s, v1.4s, v2.4s
+; SVE2-NEXT:    fadd v0.4s, v0.4s, v3.4s
+; SVE2-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; SVE2-NEXT:    ret
+;
+; SVE2P1-LABEL: fixed_fdot_wide:
+; SVE2P1:       // %bb.0: // %entry
+; SVE2P1-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2P1-NEXT:    // kill: def $q2 killed $q2 def $z2
+; SVE2P1-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE2P1-NEXT:    fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2P1-NEXT:    ret
 entry:
   %a.wide = fpext <8 x half> %a to <8 x float>
   %b.wide = fpext <8 x half> %b to <8 x float>

From 20034baca7f047b58b2da1f211aec8447a161560 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= <mikolaj.maciej.pirog@intel.com>
Date: Thu, 13 Nov 2025 11:56:46 +0100
Subject: [PATCH 26/29] [X86] Don't rely on global contraction flag (#167252)

As in title. See here for more context:
https://discourse.llvm.org/t/allowfpopfusion-vs-sdnodeflags-hasallowcontract/80909

Also add a warning in llc when the global contract flag is encountered on
X86, and remove the global contract flag from the last X86 test that used it.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 14 ++-----
 .../CodeGen/X86/llc-fp-contract-warning.ll    | 12 ++++++
 llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll    | 42 +++++++++----------
 llvm/tools/llc/llc.cpp                        |  6 +++
 4 files changed, 42 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/llc-fp-contract-warning.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6483e07afadee..0a6d1c9c021fc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8437,9 +8437,7 @@ static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
   // or MUL + ADDSUB to FMADDSUB.
-  const TargetOptions &Options = DAG.getTarget().Options;
   bool AllowFusion =
-      Options.AllowFPOpFusion == FPOpFusion::Fast ||
       (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
   if (!AllowFusion)
     return false;
@@ -54165,11 +54163,6 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
 //  FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
-  auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
-    return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
-           Flags.hasAllowContract();
-  };
-
   auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
     return DAG.getTarget().Options.NoSignedZerosFPMath ||
            Flags.hasNoSignedZeros();
@@ -54182,7 +54175,7 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
   };
 
   if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
-      !AllowContract(N->getFlags()))
+      !N->getFlags().hasAllowContract())
     return SDValue();
 
   EVT VT = N->getValueType(0);
@@ -54193,14 +54186,13 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
   SDValue RHS = N->getOperand(1);
   bool IsConj;
   SDValue FAddOp1, MulOp0, MulOp1;
-  auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
-                       &IsVectorAllNegativeZero,
+  auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &IsVectorAllNegativeZero,
                        &HasNoSignedZero](SDValue N) -> bool {
     if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
       return false;
     SDValue Op0 = N.getOperand(0);
     unsigned Opcode = Op0.getOpcode();
-    if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
+    if (Op0.hasOneUse() && Op0->getFlags().hasAllowContract()) {
       if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
         MulOp0 = Op0.getOperand(0);
         MulOp1 = Op0.getOperand(1);
diff --git a/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll b/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll
new file mode 100644
index 0000000000000..2802593c733e0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast 2>&1 | grep "X86 backend ignores --fp-contract"
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=off 2>&1 | grep "X86 backend ignores --fp-contract"
+
+; "on" is not diagnosed, because it is the default setting that is passed to the backend when no --fp-contract option is specified.
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=on 2>&1 | grep -v "X86 backend ignores --fp-contract"
+
+define float @foo(float %f) {
+  %res = fadd float %f, %f
+  ret float %res
+}
+
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
index 18588aada145c..fade0f7d1d130 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -fp-contract=fast < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s
 
 declare float @llvm.sqrt.f32(float) #2
 
@@ -24,17 +24,17 @@ define float @sqrt_ieee_ninf(float %f) #0 {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf afn VRSQRTSSr killed [[DEF]], [[COPY]]
-  ; CHECK-NEXT:   [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf contract afn VRSQRTSSr killed [[DEF]], [[COPY]]
+  ; CHECK-NEXT:   [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
   ; CHECK-NEXT:   [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
-  ; CHECK-NEXT:   [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
   ; CHECK-NEXT:   [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool)
-  ; CHECK-NEXT:   [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]]
   ; CHECK-NEXT:   [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool)
@@ -46,7 +46,7 @@ define float @sqrt_ieee_ninf(float %f) #0 {
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]]
   ; CHECK-NEXT:   $xmm0 = COPY [[COPY5]]
   ; CHECK-NEXT:   RET 0, $xmm0
-  %call = tail call ninf afn float @llvm.sqrt.f32(float %f)
+  %call = tail call ninf afn contract float @llvm.sqrt.f32(float %f)
   ret float %call
 }
 
@@ -71,17 +71,17 @@ define float @sqrt_daz_ninf(float %f) #1 {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf afn VRSQRTSSr killed [[DEF]], [[COPY]]
-  ; CHECK-NEXT:   [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf contract afn VRSQRTSSr killed [[DEF]], [[COPY]]
+  ; CHECK-NEXT:   [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
   ; CHECK-NEXT:   [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
-  ; CHECK-NEXT:   [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
   ; CHECK-NEXT:   [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool)
-  ; CHECK-NEXT:   [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
-  ; CHECK-NEXT:   [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
+  ; CHECK-NEXT:   [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]]
   ; CHECK-NEXT:   [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS
   ; CHECK-NEXT:   [[VCMPSSrri:%[0-9]+]]:fr32 = nofpexcept VCMPSSrri [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr
@@ -90,7 +90,7 @@ define float @sqrt_daz_ninf(float %f) #1 {
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]]
   ; CHECK-NEXT:   $xmm0 = COPY [[COPY3]]
   ; CHECK-NEXT:   RET 0, $xmm0
-  %call = tail call ninf afn float @llvm.sqrt.f32(float %f)
+  %call = tail call ninf afn contract float @llvm.sqrt.f32(float %f)
   ret float %call
 }
 
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index dc2f878830863..92906b44e0818 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -604,6 +604,12 @@ static int compileModule(char **argv, LLVMContext &Context,
                     InputFilename);
     }
 
+    if (TheTriple.isX86() &&
+        codegen::getFuseFPOps() != FPOpFusion::FPOpFusionMode::Standard)
+      WithColor::warning(errs(), argv[0])
+          << "X86 backend ignores --fp-contract setting; use IR fast-math "
+             "flags instead.";
+
     Options.BinutilsVersion =
         TargetMachine::parseBinutilsVersion(BinutilsVersion);
     Options.MCOptions.ShowMCEncoding = ShowMCEncoding;

From 59c01cc8bb37dcff2d2efc2ddf287b6b89438200 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 13 Nov 2025 11:36:22 +0000
Subject: [PATCH 27/29] [Headers][X86] Update FMA3/FMA4 scalar intrinsics to
 use __builtin_elementwise_fma and support constexpr (#154731)

Now that #152455 is done, we can make all the scalar FMA intrinsics
wrap __builtin_elementwise_fma, which also allows them to be constexpr.

The main difference is that the FMA4 intrinsics guarantee that the upper
elements are zero, while the FMA3 intrinsics pass through the destination
register elements, like older scalar instructions.

Fixes #154555
---
 clang/include/clang/Basic/BuiltinsX86.td |  10 --
 clang/lib/CodeGen/TargetBuiltins/X86.cpp |   6 --
 clang/lib/Headers/fma4intrin.h           |  56 +++++------
 clang/lib/Headers/fmaintrin.h            |  64 ++++++-------
 clang/test/CodeGen/X86/fma-builtins.c    | 100 ++++++++++---------
 clang/test/CodeGen/X86/fma4-builtins.c   | 116 ++++++++++++++---------
 6 files changed, 180 insertions(+), 172 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index cb08e2107f072..14c7d636ad51e 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -866,16 +866,6 @@ let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
   def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
 }
 
-let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
-  def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
-}
-
-let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
-  def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
-}
-
 let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
   def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index 2381b2e7cf2cf..00c8a1cf16e31 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -1028,16 +1028,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
 
-  case X86::BI__builtin_ia32_vfmaddss3:
-  case X86::BI__builtin_ia32_vfmaddsd3:
   case X86::BI__builtin_ia32_vfmaddsh3_mask:
   case X86::BI__builtin_ia32_vfmaddss3_mask:
   case X86::BI__builtin_ia32_vfmaddsd3_mask:
     return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
-  case X86::BI__builtin_ia32_vfmaddss:
-  case X86::BI__builtin_ia32_vfmaddsd:
-    return EmitScalarFMAExpr(*this, E, Ops,
-                             Constant::getNullValue(Ops[0]->getType()));
   case X86::BI__builtin_ia32_vfmaddsh3_maskz:
   case X86::BI__builtin_ia32_vfmaddss3_maskz:
   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h
index e0a0e4c968950..20b8030b77adc 100644
--- a/clang/lib/Headers/fma4intrin.h
+++ b/clang/lib/Headers/fma4intrin.h
@@ -40,16 +40,14 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) {
+  return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) {
+  return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -64,16 +62,14 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -88,16 +84,14 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) {
+  return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) {
+  return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -112,16 +106,14 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index c51009079f8d5..eba527f3604d0 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -95,10 +95,10 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a scalar multiply-add of the double-precision values in the
@@ -124,10 +124,10 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
@@ -195,10 +195,10 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 ///   32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a scalar multiply-subtract of the double-precision values in
@@ -224,10 +224,10 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
@@ -295,10 +295,10 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a scalar negated multiply-add of the double-precision values
@@ -324,10 +324,10 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
@@ -395,10 +395,10 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a scalar negated multiply-subtract of the double-precision
@@ -424,10 +424,10 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
diff --git a/clang/test/CodeGen/X86/fma-builtins.c b/clang/test/CodeGen/X86/fma-builtins.c
index 5445e50d4ecea..ea93bca2bad65 100644
--- a/clang/test/CodeGen/X86/fma-builtins.c
+++ b/clang/test/CodeGen/X86/fma-builtins.c
@@ -28,23 +28,25 @@ TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmadd_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_fmadd_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_fmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -7.0f, 1.0f, -2.0f, -0.0f));
 
 __m128d test_mm_fmadd_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmadd_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_fmadd_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_fmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 1.0));
 
 __m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmsub_ps
@@ -64,25 +66,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[NEG]])
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_fmsub_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_fmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -9.0f, 1.0f, -2.0f, -0.0f));
 
 __m128d test_mm_fmsub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[NEG]])
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_fmsub_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_fmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 1.0));
 
 __m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmadd_ps
@@ -102,25 +106,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0
 
 __m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmadd_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float %{{.*}})
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_fnmadd_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_fnmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 9.0f, 1.0f, -2.0f, -0.0f));
 
 __m128d test_mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fnmadd_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double %{{.*}})
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_fnmadd_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_fnmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 1.0));
 
 __m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmsub_ps
@@ -142,27 +148,29 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0
 
 __m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG2:%.+]] = fneg float %{{.+}}
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float [[NEG2]])
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_fnmsub_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_fnmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 7.0f, 1.0f, -2.0f, -0.0f));
 
 __m128d test_mm_fnmsub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fnmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG2:%.+]] = fneg double %{{.+}}
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double [[NEG2]])
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_fnmsub_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_fnmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 1.0));
 
 __m128 test_mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmaddsub_ps
diff --git a/clang/test/CodeGen/X86/fma4-builtins.c b/clang/test/CodeGen/X86/fma4-builtins.c
index fb449d5da2591..949519864512b 100644
--- a/clang/test/CodeGen/X86/fma4-builtins.c
+++ b/clang/test/CodeGen/X86/fma4-builtins.c
@@ -28,23 +28,29 @@ TEST_CONSTEXPR(match_m128d(_mm_macc_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_macc_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_macc_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
-  // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
+  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_macc_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_macc_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -7.0f, 0.0f, 0.0f, 0.0f));
 
 __m128d test_mm_macc_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_macc_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
-  // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
+  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_macc_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_macc_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 0.0));
 
 __m128 test_mm_msub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_msub_ps
@@ -64,25 +70,31 @@ TEST_CONSTEXPR(match_m128d(_mm_msub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_msub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_msub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[C]])
-  // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[NEG]])
+  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_msub_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_msub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -9.0f, 0.0f, 0.0f, 0.0f));
 
 __m128d test_mm_msub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_msub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[C]])
-  // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[NEG]])
+  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_msub_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_msub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 0.0));
 
 __m128 test_mm_nmacc_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmacc_ps
@@ -102,25 +114,31 @@ TEST_CONSTEXPR(match_m128d(_mm_nmacc_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_nmacc_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmacc_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: call float @llvm.fma.f32(float [[A]], float %{{.*}}, float %{{.*}})
-  // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: call float @llvm.fma.f32(float [[NEG]], float %{{.*}}, float %{{.*}})
+  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_nmacc_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_nmacc_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 9.0f, 0.0f, 0.0f, 0.0f));
 
 __m128d test_mm_nmacc_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_nmacc_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.fma.f64(double [[A]], double %{{.*}}, double %{{.*}})
-  // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: call double @llvm.fma.f64(double [[NEG]], double %{{.*}}, double %{{.*}})
+  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_nmacc_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_nmacc_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 0.0));
 
 __m128 test_mm_nmsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmsub_ps
@@ -142,27 +160,33 @@ TEST_CONSTEXPR(match_m128d(_mm_nmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_nmsub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK: call float @llvm.fma.f32(float [[A]], float %{{.*}}, float [[C]])
-  // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG2:%.+]] = fneg float %{{.+}}
+  // CHECK: call float @llvm.fma.f32(float [[NEG]], float %{{.*}}, float [[NEG2]])
+  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_nmsub_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_nmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 7.0f, 0.0f, 0.0f, 0.0f));
 
 __m128d test_mm_nmsub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_nmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK: call double @llvm.fma.f64(double [[A]], double %{{.*}}, double [[C]])
-  // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG2:%.+]] = fneg double %{{.+}}
+  // CHECK: call double @llvm.fma.f64(double [[NEG]], double %{{.*}}, double [[NEG2]])
+  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_nmsub_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_nmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 0.0));
 
 __m128 test_mm_maddsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_maddsub_ps

From 876114f0f799cffd96a5a41ffcd9f8477405748e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 13 Nov 2025 11:40:10 +0000
Subject: [PATCH 28/29] [X86] Add widenBuildVector to create a wider build
 vector if the scalars are mergeable (#167667)

See if each pair of scalar operands of a build vector can be freely
merged together - typically if they've been split for some reason by
legalization.

If we can, create a new build vector node with double the scalar size
but half the element count - reducing codegen complexity and potentially
allowing further optimization.

I did look at performing this generically in DAGCombine, but we don't
have as much control over when a legal build vector can be folded -
another generic fold would be to handle this on insert_vector_elt pairs,
but again legality checks could be limiting.

Fixes #167498
---
 llvm/lib/Target/X86/X86ISelLowering.cpp   |  52 +++
 llvm/test/CodeGen/X86/build-vector-128.ll | 384 ++++------------------
 llvm/test/CodeGen/X86/setcc-wide-types.ll |  44 +--
 3 files changed, 141 insertions(+), 339 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0a6d1c9c021fc..a2f505b9cff8a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8865,6 +8865,56 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
   return SDValue();
 }
 
+/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable.
+static SDValue widenBuildVector(BuildVectorSDNode *BVOp, SDLoc const &DL,
+                                X86Subtarget const &Subtarget,
+                                SelectionDAG &DAG) {
+  using namespace SDPatternMatch;
+  MVT VT = BVOp->getSimpleValueType(0);
+  MVT SVT = VT.getScalarType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltBits = SVT.getSizeInBits();
+
+  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
+    return SDValue();
+
+  unsigned WideBits = 2 * EltBits;
+  MVT WideSVT = MVT::getIntegerVT(WideBits);
+  MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2);
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT))
+    return SDValue();
+
+  SmallVector<SDValue, 8> WideOps;
+  for (unsigned I = 0; I != NumElts; I += 2) {
+    SDValue Op0 = BVOp->getOperand(I + 0);
+    SDValue Op1 = BVOp->getOperand(I + 1);
+
+    if (Op0.isUndef() && Op1.isUndef()) {
+      WideOps.push_back(DAG.getUNDEF(WideSVT));
+      continue;
+    }
+
+    // TODO: Constant repacking?
+
+    // Merge scalars that have been split from the same source.
+    SDValue X, Y;
+    if (sd_match(Op0, m_Trunc(m_Value(X))) &&
+        sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) &&
+        peekThroughTruncates(X) == peekThroughTruncates(Y) &&
+        X.getValueType().bitsGE(WideSVT)) {
+      if (X.getValueType().bitsGT(WideSVT))
+        X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X);
+      WideOps.push_back(X);
+      continue;
+    }
+
+    return SDValue();
+  }
+
+  assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
+  return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps));
+}
+
 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
 /// functionality to do this, so it's all zeros, all ones, or some derivation
 /// that is cheap to calculate.
@@ -9335,6 +9385,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     return BitOp;
   if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
     return Blend;
+  if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG))
+    return WideBV;
 
   unsigned NumZero = ZeroMask.popcount();
   unsigned NumNonZero = NonZeroMask.popcount();
diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll
index b8bb417e1860c..59eb776ac365e 100644
--- a/llvm/test/CodeGen/X86/build-vector-128.ll
+++ b/llvm/test/CodeGen/X86/build-vector-128.ll
@@ -418,32 +418,12 @@ define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind
 ; SSE-32-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    retl
 ;
-; SSE2-64-LABEL: test_buildvector_v2i64_split_v4i32:
-; SSE2-64:       # %bb.0:
-; SSE2-64-NEXT:    movl %edi, %eax
-; SSE2-64-NEXT:    movl %esi, %ecx
-; SSE2-64-NEXT:    shrq $32, %rdi
-; SSE2-64-NEXT:    shrq $32, %rsi
-; SSE2-64-NEXT:    movd %ecx, %xmm1
-; SSE2-64-NEXT:    movd %esi, %xmm0
-; SSE2-64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-64-NEXT:    movd %eax, %xmm0
-; SSE2-64-NEXT:    movd %edi, %xmm2
-; SSE2-64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-64-NEXT:    retq
-;
-; SSE41-64-LABEL: test_buildvector_v2i64_split_v4i32:
-; SSE41-64:       # %bb.0:
-; SSE41-64-NEXT:    movl %edi, %eax
-; SSE41-64-NEXT:    movl %esi, %ecx
-; SSE41-64-NEXT:    shrq $32, %rdi
-; SSE41-64-NEXT:    shrq $32, %rsi
-; SSE41-64-NEXT:    movd %eax, %xmm0
-; SSE41-64-NEXT:    pinsrd $1, %edi, %xmm0
-; SSE41-64-NEXT:    pinsrd $2, %ecx, %xmm0
-; SSE41-64-NEXT:    pinsrd $3, %esi, %xmm0
-; SSE41-64-NEXT:    retq
+; SSE-64-LABEL: test_buildvector_v2i64_split_v4i32:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movq %rsi, %xmm1
+; SSE-64-NEXT:    movq %rdi, %xmm0
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_buildvector_v2i64_split_v4i32:
 ; AVX-32:       # %bb.0:
@@ -452,14 +432,9 @@ define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind
 ;
 ; AVX-64-LABEL: test_buildvector_v2i64_split_v4i32:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    movl %edi, %eax
-; AVX-64-NEXT:    movl %esi, %ecx
-; AVX-64-NEXT:    shrq $32, %rdi
-; AVX-64-NEXT:    shrq $32, %rsi
-; AVX-64-NEXT:    vmovd %eax, %xmm0
-; AVX-64-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
+; AVX-64-NEXT:    vmovq %rsi, %xmm0
+; AVX-64-NEXT:    vmovq %rdi, %xmm1
+; AVX-64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX-64-NEXT:    retq
   %a0.lo = trunc i64 %a0 to i32
   %a1.lo = trunc i64 %a1 to i32
@@ -475,12 +450,10 @@ define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind
 }
 
 define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
-; SSE2-32-LABEL: test_buildvector_v4i32_split_v8i16:
-; SSE2-32:       # %bb.0:
-; SSE2-32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-32-NEXT:    retl
+; SSE-32-LABEL: test_buildvector_v4i32_split_v8i16:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
+; SSE-32-NEXT:    retl
 ;
 ; SSE2-64-LABEL: test_buildvector_v4i32_split_v8i16:
 ; SSE2-64:       # %bb.0:
@@ -493,80 +466,25 @@ define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2,
 ; SSE2-64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-64-NEXT:    retq
 ;
-; SSE41-32-LABEL: test_buildvector_v4i32_split_v8i16:
-; SSE41-32:       # %bb.0:
-; SSE41-32-NEXT:    pushl %esi
-; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SSE41-32-NEXT:    movd %esi, %xmm0
-; SSE41-32-NEXT:    shrl $16, %esi
-; SSE41-32-NEXT:    pinsrw $1, %esi, %xmm0
-; SSE41-32-NEXT:    pinsrw $2, %edx, %xmm0
-; SSE41-32-NEXT:    shrl $16, %edx
-; SSE41-32-NEXT:    pinsrw $3, %edx, %xmm0
-; SSE41-32-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSE41-32-NEXT:    shrl $16, %ecx
-; SSE41-32-NEXT:    pinsrw $5, %ecx, %xmm0
-; SSE41-32-NEXT:    pinsrw $6, %eax, %xmm0
-; SSE41-32-NEXT:    shrl $16, %eax
-; SSE41-32-NEXT:    pinsrw $7, %eax, %xmm0
-; SSE41-32-NEXT:    popl %esi
-; SSE41-32-NEXT:    retl
-;
 ; SSE41-64-LABEL: test_buildvector_v4i32_split_v8i16:
 ; SSE41-64:       # %bb.0:
 ; SSE41-64-NEXT:    movd %edi, %xmm0
-; SSE41-64-NEXT:    shrl $16, %edi
-; SSE41-64-NEXT:    pinsrw $1, %edi, %xmm0
-; SSE41-64-NEXT:    pinsrw $2, %esi, %xmm0
-; SSE41-64-NEXT:    shrl $16, %esi
-; SSE41-64-NEXT:    pinsrw $3, %esi, %xmm0
-; SSE41-64-NEXT:    pinsrw $4, %edx, %xmm0
-; SSE41-64-NEXT:    shrl $16, %edx
-; SSE41-64-NEXT:    pinsrw $5, %edx, %xmm0
-; SSE41-64-NEXT:    pinsrw $6, %ecx, %xmm0
-; SSE41-64-NEXT:    shrl $16, %ecx
-; SSE41-64-NEXT:    pinsrw $7, %ecx, %xmm0
+; SSE41-64-NEXT:    pinsrd $1, %esi, %xmm0
+; SSE41-64-NEXT:    pinsrd $2, %edx, %xmm0
+; SSE41-64-NEXT:    pinsrd $3, %ecx, %xmm0
 ; SSE41-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_buildvector_v4i32_split_v8i16:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    pushl %esi
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; AVX-32-NEXT:    vmovd %esi, %xmm0
-; AVX-32-NEXT:    shrl $16, %esi
-; AVX-32-NEXT:    vpinsrw $1, %esi, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $2, %edx, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $16, %edx
-; AVX-32-NEXT:    vpinsrw $3, %edx, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $16, %ecx
-; AVX-32-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $16, %eax
-; AVX-32-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    popl %esi
+; AVX-32-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    retl
 ;
 ; AVX-64-LABEL: test_buildvector_v4i32_split_v8i16:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vmovd %edi, %xmm0
-; AVX-64-NEXT:    shrl $16, %edi
-; AVX-64-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrw $2, %esi, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $16, %esi
-; AVX-64-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $16, %edx
-; AVX-64-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $16, %ecx
-; AVX-64-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
 ; AVX-64-NEXT:    retq
   %a0.lo = trunc i32 %a0 to i16
   %a1.lo = trunc i32 %a1 to i16
@@ -594,244 +512,88 @@ define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2,
 define <16 x i8> @test_buildvector_v8i16_split_v16i8(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
 ; SSE2-32-LABEL: test_buildvector_v8i16_split_v16i8:
 ; SSE2-32:       # %bb.0:
-; SSE2-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movd %eax, %xmm1
-; SSE2-32-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-32-NEXT:    psrld $8, %xmm0
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movd %eax, %xmm0
-; SSE2-32-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-32-NEXT:    psrld $8, %xmm2
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movd %eax, %xmm2
-; SSE2-32-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-32-NEXT:    psrld $8, %xmm1
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movd %eax, %xmm1
-; SSE2-32-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-32-NEXT:    psrld $8, %xmm3
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-32-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movd %eax, %xmm0
-; SSE2-32-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-32-NEXT:    psrld $8, %xmm2
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movd %eax, %xmm2
-; SSE2-32-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-32-NEXT:    psrld $8, %xmm3
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movd %eax, %xmm3
-; SSE2-32-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-32-NEXT:    psrld $8, %xmm0
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movd %eax, %xmm0
-; SSE2-32-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-32-NEXT:    psrld $8, %xmm4
-; SSE2-32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-32-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-32-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-32-NEXT:    retl
 ;
 ; SSE2-64-LABEL: test_buildvector_v8i16_split_v16i8:
 ; SSE2-64:       # %bb.0:
-; SSE2-64-NEXT:    pushq %rbp
-; SSE2-64-NEXT:    pushq %r15
-; SSE2-64-NEXT:    pushq %r14
-; SSE2-64-NEXT:    pushq %rbx
-; SSE2-64-NEXT:    movzwl %di, %eax
-; SSE2-64-NEXT:    movzwl %si, %r10d
-; SSE2-64-NEXT:    movzwl %dx, %r11d
-; SSE2-64-NEXT:    movzwl %cx, %ebx
-; SSE2-64-NEXT:    movzwl %r8w, %ebp
-; SSE2-64-NEXT:    movzwl %r9w, %r14d
-; SSE2-64-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
-; SSE2-64-NEXT:    movd %r15d, %xmm0
-; SSE2-64-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-64-NEXT:    psrld $8, %xmm1
-; SSE2-64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-64-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
-; SSE2-64-NEXT:    movd %r15d, %xmm2
-; SSE2-64-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-64-NEXT:    psrld $8, %xmm1
-; SSE2-64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-64-NEXT:    movd %r9d, %xmm0
-; SSE2-64-NEXT:    movd %r14d, %xmm1
-; SSE2-64-NEXT:    psrld $8, %xmm1
-; SSE2-64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-64-NEXT:    movd %r8d, %xmm1
-; SSE2-64-NEXT:    movd %ebp, %xmm3
-; SSE2-64-NEXT:    psrld $8, %xmm3
-; SSE2-64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-64-NEXT:    movd %ecx, %xmm0
-; SSE2-64-NEXT:    movd %ebx, %xmm2
-; SSE2-64-NEXT:    psrld $8, %xmm2
-; SSE2-64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-64-NEXT:    movd %edx, %xmm2
-; SSE2-64-NEXT:    movd %r11d, %xmm3
-; SSE2-64-NEXT:    psrld $8, %xmm3
-; SSE2-64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-64-NEXT:    movd %r9d, %xmm0
+; SSE2-64-NEXT:    movd %r8d, %xmm2
 ; SSE2-64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-64-NEXT:    movd %ecx, %xmm0
+; SSE2-64-NEXT:    movd %edx, %xmm1
+; SSE2-64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-64-NEXT:    movd %esi, %xmm3
-; SSE2-64-NEXT:    movd %r10d, %xmm0
-; SSE2-64-NEXT:    psrld $8, %xmm0
-; SSE2-64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSE2-64-NEXT:    movd %edi, %xmm0
-; SSE2-64-NEXT:    movd %eax, %xmm4
-; SSE2-64-NEXT:    psrld $8, %xmm4
-; SSE2-64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; SSE2-64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-64-NEXT:    popq %rbx
-; SSE2-64-NEXT:    popq %r14
-; SSE2-64-NEXT:    popq %r15
-; SSE2-64-NEXT:    popq %rbp
+; SSE2-64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-64-NEXT:    retq
 ;
 ; SSE41-32-LABEL: test_buildvector_v8i16_split_v16i8:
 ; SSE41-32:       # %bb.0:
-; SSE41-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    movd %eax, %xmm0
-; SSE41-32-NEXT:    shrl $8, %eax
-; SSE41-32-NEXT:    pinsrb $1, %eax, %xmm0
-; SSE41-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    pinsrb $2, %eax, %xmm0
-; SSE41-32-NEXT:    shrl $8, %eax
-; SSE41-32-NEXT:    pinsrb $3, %eax, %xmm0
-; SSE41-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    pinsrb $4, %eax, %xmm0
-; SSE41-32-NEXT:    shrl $8, %eax
-; SSE41-32-NEXT:    pinsrb $5, %eax, %xmm0
-; SSE41-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    pinsrb $6, %eax, %xmm0
-; SSE41-32-NEXT:    shrl $8, %eax
-; SSE41-32-NEXT:    pinsrb $7, %eax, %xmm0
-; SSE41-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    pinsrb $8, %eax, %xmm0
-; SSE41-32-NEXT:    shrl $8, %eax
-; SSE41-32-NEXT:    pinsrb $9, %eax, %xmm0
-; SSE41-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    pinsrb $10, %eax, %xmm0
-; SSE41-32-NEXT:    shrl $8, %eax
-; SSE41-32-NEXT:    pinsrb $11, %eax, %xmm0
-; SSE41-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    pinsrb $12, %eax, %xmm0
-; SSE41-32-NEXT:    shrl $8, %eax
-; SSE41-32-NEXT:    pinsrb $13, %eax, %xmm0
-; SSE41-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    pinsrb $14, %eax, %xmm0
-; SSE41-32-NEXT:    shrl $8, %eax
-; SSE41-32-NEXT:    pinsrb $15, %eax, %xmm0
+; SSE41-32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-32-NEXT:    pinsrw $1, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT:    pinsrw $2, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT:    pinsrw $3, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT:    pinsrw $4, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT:    pinsrw $5, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT:    pinsrw $6, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT:    pinsrw $7, {{[0-9]+}}(%esp), %xmm0
 ; SSE41-32-NEXT:    retl
 ;
 ; SSE41-64-LABEL: test_buildvector_v8i16_split_v16i8:
 ; SSE41-64:       # %bb.0:
 ; SSE41-64-NEXT:    movd %edi, %xmm0
-; SSE41-64-NEXT:    shrl $8, %edi
-; SSE41-64-NEXT:    pinsrb $1, %edi, %xmm0
-; SSE41-64-NEXT:    pinsrb $2, %esi, %xmm0
-; SSE41-64-NEXT:    shrl $8, %esi
-; SSE41-64-NEXT:    pinsrb $3, %esi, %xmm0
-; SSE41-64-NEXT:    pinsrb $4, %edx, %xmm0
-; SSE41-64-NEXT:    shrl $8, %edx
-; SSE41-64-NEXT:    pinsrb $5, %edx, %xmm0
-; SSE41-64-NEXT:    pinsrb $6, %ecx, %xmm0
-; SSE41-64-NEXT:    shrl $8, %ecx
-; SSE41-64-NEXT:    pinsrb $7, %ecx, %xmm0
-; SSE41-64-NEXT:    pinsrb $8, %r8d, %xmm0
-; SSE41-64-NEXT:    shrl $8, %r8d
-; SSE41-64-NEXT:    pinsrb $9, %r8d, %xmm0
-; SSE41-64-NEXT:    pinsrb $10, %r9d, %xmm0
-; SSE41-64-NEXT:    shrl $8, %r9d
-; SSE41-64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
-; SSE41-64-NEXT:    pinsrb $11, %r9d, %xmm0
-; SSE41-64-NEXT:    pinsrb $12, %eax, %xmm0
-; SSE41-64-NEXT:    shrl $8, %eax
-; SSE41-64-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
-; SSE41-64-NEXT:    pinsrb $13, %eax, %xmm0
-; SSE41-64-NEXT:    pinsrb $14, %ecx, %xmm0
-; SSE41-64-NEXT:    shrl $8, %ecx
-; SSE41-64-NEXT:    pinsrb $15, %ecx, %xmm0
+; SSE41-64-NEXT:    pinsrw $1, %esi, %xmm0
+; SSE41-64-NEXT:    pinsrw $2, %edx, %xmm0
+; SSE41-64-NEXT:    pinsrw $3, %ecx, %xmm0
+; SSE41-64-NEXT:    pinsrw $4, %r8d, %xmm0
+; SSE41-64-NEXT:    pinsrw $5, %r9d, %xmm0
+; SSE41-64-NEXT:    pinsrw $6, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-64-NEXT:    pinsrw $7, {{[0-9]+}}(%rsp), %xmm0
 ; SSE41-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_buildvector_v8i16_split_v16i8:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vmovd %eax, %xmm0
-; AVX-32-NEXT:    shrl $8, %eax
-; AVX-32-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $8, %eax
-; AVX-32-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $8, %eax
-; AVX-32-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $8, %eax
-; AVX-32-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $8, %eax
-; AVX-32-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $8, %eax
-; AVX-32-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $8, %eax
-; AVX-32-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    shrl $8, %eax
-; AVX-32-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    retl
 ;
 ; AVX-64-LABEL: test_buildvector_v8i16_split_v16i8:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vmovd %edi, %xmm0
-; AVX-64-NEXT:    shrl $8, %edi
-; AVX-64-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $8, %esi
-; AVX-64-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrb $4, %edx, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $8, %edx
-; AVX-64-NEXT:    vpinsrb $5, %edx, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $8, %ecx
-; AVX-64-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrb $8, %r8d, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $8, %r8d
-; AVX-64-NEXT:    vpinsrb $9, %r8d, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $8, %r9d
-; AVX-64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
-; AVX-64-NEXT:    vpinsrb $11, %r9d, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $8, %eax
-; AVX-64-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
-; AVX-64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX-64-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX-64-NEXT:    shrl $8, %ecx
-; AVX-64-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrw $1, %esi, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrw $2, %edx, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrw $4, %r8d, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrw $5, %r9d, %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT:    vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
 ; AVX-64-NEXT:    retq
   %a0.lo = trunc i16 %a0 to i8
   %a1.lo = trunc i16 %a1 to i8
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index d018c535ea8f7..67b516c4f0612 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -722,39 +722,27 @@ define i1 @ne_v4i256(<4 x i256> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovd %eax, %xmm0
-; AVX512-NEXT:    shrq $32, %rax
-; AVX512-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
 ; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vpinsrd $2, %r10d, %xmm0, %xmm0
-; AVX512-NEXT:    shrq $32, %r10
-; AVX512-NEXT:    vpinsrd $3, %r10d, %xmm0, %xmm0
-; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    vmovd %r8d, %xmm1
-; AVX512-NEXT:    shrq $32, %r8
-; AVX512-NEXT:    vpinsrd $1, %r8d, %xmm1, %xmm1
+; AVX512-NEXT:    vmovq %r10, %xmm0
+; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %r9
-; AVX512-NEXT:    vpinsrd $2, %r9d, %xmm1, %xmm1
-; AVX512-NEXT:    shrq $32, %r9
-; AVX512-NEXT:    vpinsrd $3, %r9d, %xmm1, %xmm1
+; AVX512-NEXT:    vmovq %r9, %xmm1
+; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %r8
+; AVX512-NEXT:    vmovq %r8, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    vmovd %edx, %xmm1
-; AVX512-NEXT:    shrq $32, %rdx
-; AVX512-NEXT:    vpinsrd $1, %edx, %xmm1, %xmm1
 ; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
-; AVX512-NEXT:    shrq $32, %rcx
-; AVX512-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
-; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT:    vmovd %edi, %xmm2
-; AVX512-NEXT:    shrq $32, %rdi
-; AVX512-NEXT:    vpinsrd $1, %edi, %xmm2, %xmm2
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT:    vpinsrd $2, %esi, %xmm2, %xmm2
-; AVX512-NEXT:    shrq $32, %rsi
-; AVX512-NEXT:    vpinsrd $3, %esi, %xmm2, %xmm2
+; AVX512-NEXT:    vmovq %rsi, %xmm2
+; AVX512-NEXT:    orq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT:    vmovq %rdi, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k0

From f84ad4504dfba9df049296d451ec8da668e847a4 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Thu, 13 Nov 2025 11:47:12 +0000
Subject: [PATCH 29/29] [LLVM][InstCombine] not (bitcast (cmp A, B)) --> bitcast
 (!cmp A, B) (#167693)

---
 .../InstCombine/InstCombineAndOrXor.cpp       | 10 ++-
 llvm/test/Transforms/InstCombine/not.ll       | 78 +++++++++++++++++++
 2 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index cbaff294819a2..ba5568b00441b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -5096,9 +5096,17 @@ Instruction *InstCombinerImpl::foldNot(BinaryOperator &I) {
     return &I;
   }
 
+  // not (bitcast (cmp A, B)) --> bitcast (!cmp A, B)
+  if (match(NotOp, m_OneUse(m_BitCast(m_Value(X)))) &&
+      match(X, m_OneUse(m_Cmp(Pred, m_Value(), m_Value())))) {
+    cast<CmpInst>(X)->setPredicate(CmpInst::getInversePredicate(Pred));
+    return new BitCastInst(X, Ty);
+  }
+
   // Move a 'not' ahead of casts of a bool to enable logic reduction:
   // not (bitcast (sext i1 X)) --> bitcast (sext (not i1 X))
-  if (match(NotOp, m_OneUse(m_BitCast(m_OneUse(m_SExt(m_Value(X)))))) && X->getType()->isIntOrIntVectorTy(1)) {
+  if (match(NotOp, m_OneUse(m_BitCast(m_OneUse(m_SExt(m_Value(X)))))) &&
+      X->getType()->isIntOrIntVectorTy(1)) {
     Type *SextTy = cast<BitCastOperator>(NotOp)->getSrcTy();
     Value *NotX = Builder.CreateNot(X);
     Value *Sext = Builder.CreateSExt(NotX, SextTy);
diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll
index d693b9d8f8557..1acf55a50208d 100644
--- a/llvm/test/Transforms/InstCombine/not.ll
+++ b/llvm/test/Transforms/InstCombine/not.ll
@@ -1061,3 +1061,81 @@ if.else:
   call void @f2()
   unreachable
 }
+
+define i8 @invert_bitcasted_icmp(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @invert_bitcasted_icmp(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[MASK_AS_INT:%.*]] = bitcast <8 x i1> [[CMP]] to i8
+; CHECK-NEXT:    ret i8 [[MASK_AS_INT]]
+;
+  %cmp = icmp sle <8 x i32> %a, %b
+  %mask.as.int = bitcast <8 x i1> %cmp to i8
+  %not = xor i8 %mask.as.int, 255
+  ret i8 %not
+}
+
+define i8 @invert_bitcasted_icmp_samesign(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @invert_bitcasted_icmp_samesign(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign sgt <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[MASK_AS_INT:%.*]] = bitcast <8 x i1> [[CMP]] to i8
+; CHECK-NEXT:    ret i8 [[MASK_AS_INT]]
+;
+  %cmp = icmp samesign sle <8 x i32> %a, %b
+  %mask.as.int = bitcast <8 x i1> %cmp to i8
+  %not = xor i8 %mask.as.int, 255
+  ret i8 %not
+}
+
+define i8 @invert_bitcasted_icmp_multi_use_1(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @invert_bitcasted_icmp_multi_use_1(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(<8 x i1> [[CMP]])
+; CHECK-NEXT:    [[MASK_AS_INT:%.*]] = bitcast <8 x i1> [[CMP]] to i8
+; CHECK-NEXT:    [[NOT:%.*]] = xor i8 [[MASK_AS_INT]], -1
+; CHECK-NEXT:    ret i8 [[NOT]]
+;
+  %cmp = icmp sle <8 x i32> %a, %b
+  call void (...) @llvm.fake.use(<8 x i1> %cmp)
+  %mask.as.int = bitcast <8 x i1> %cmp to i8
+  %not = xor i8 %mask.as.int, -1
+  ret i8 %not
+}
+
+define i8 @invert_bitcasted_icmp_multi_use_2(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @invert_bitcasted_icmp_multi_use_2(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[MASK_AS_INT:%.*]] = bitcast <8 x i1> [[CMP]] to i8
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i8 [[MASK_AS_INT]])
+; CHECK-NEXT:    [[NOT:%.*]] = xor i8 [[MASK_AS_INT]], -1
+; CHECK-NEXT:    ret i8 [[NOT]]
+;
+  %cmp = icmp sle <8 x i32> %a, %b
+  %mask.as.int = bitcast <8 x i1> %cmp to i8
+  call void (...) @llvm.fake.use(i8 %mask.as.int)
+  %not = xor i8 %mask.as.int, -1
+  ret i8 %not
+}
+
+define i8 @invert_bitcasted_fcmp(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @invert_bitcasted_fcmp(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp uge <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[MASK_AS_INT:%.*]] = bitcast <8 x i1> [[CMP]] to i8
+; CHECK-NEXT:    ret i8 [[MASK_AS_INT]]
+;
+  %cmp = fcmp olt <8 x float> %a, %b
+  %mask.as.int = bitcast <8 x i1> %cmp to i8
+  %not = xor i8 %mask.as.int, 255
+  ret i8 %not
+}
+
+define i8 @invert_bitcasted_fcmp_fast(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @invert_bitcasted_fcmp_fast(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp fast uge <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[MASK_AS_INT:%.*]] = bitcast <8 x i1> [[CMP]] to i8
+; CHECK-NEXT:    ret i8 [[MASK_AS_INT]]
+;
+  %cmp = fcmp fast olt <8 x float> %a, %b
+  %mask.as.int = bitcast <8 x i1> %cmp to i8
+  %not = xor i8 %mask.as.int, 255
+  ret i8 %not
+}