From 69b80360127146e4c8faae14bb9729e144f913e0 Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Fri, 31 Oct 2025 12:18:01 -0700 Subject: [PATCH 1/6] Changes from 500 VMs Hybrid work Adds VM recovery playbooks Enable hugepages on the hypervisor and VM configuration. Add playbook to disable devices created for virtual functions Working changes to configure hugetlb Complaint of a missing var, this import seems required Improve CSR approve and node Ready wait loop Add interfaces that can be used as virtual functions But they seemed to generate a lot of iowait activity on the VMs, so I don't know whether something is wrong with them or not Some changes were generated using Cursor and the claude-4-sonnet model. Signed-off-by: Andrew Collins Apply suggestion from @mcornea Co-authored-by: Marius Cornea --- .gitignore | 3 +- ansible.cfg | 5 +- ansible/.gitignore | 1 + ansible/copy-pull-secret.yml | 15 +++ ansible/hv-vm-start-one.yml | 6 + ansible/hv-vm-stop-all.yml | 5 + ansible/mno-add-vm-workers.yml | 121 ++++++++++++++++++ ansible/roles/copy-pull-secret/tasks/main.yml | 11 ++ ansible/roles/hv-install/defaults/main.yml | 17 +++ ansible/roles/hv-install/tasks/main.yml | 67 ++++++++++ .../templates/hugetlb-reserve-pages.sh.j2 | 15 +++ ansible/roles/hv-vm-create/defaults/main.yml | 16 ++- ansible/roles/hv-vm-create/tasks/main.yml | 27 ++++ .../hv-vm-create/templates/kvm-def.xml.j2 | 20 +++ ansible/roles/hv-vm-destroy/tasks/main.yml | 5 + ansible/roles/hv-vm-start/tasks/main.yml | 5 + .../tasks/check_nodes_joined.yml | 45 ++----- .../roles/ocp-scale-out-csr/tasks/main.yml | 17 ++- .../tasks/set_hostname_role.yml | 1 + ansible/vars/hv.sample.yml | 32 +++++ ansible/vars/lab.yml | 2 +- ansible/vars/scale_out.sample.yml | 4 +- ansible/vm-sriov-disable.yml | 25 ++++ docs/deploy-vmno.md | 31 +++++ docs/hypervisors.md | 6 +- docs/troubleshooting.md | 10 ++ 26 files changed, 458 insertions(+), 54 deletions(-) create mode 100644 ansible/copy-pull-secret.yml create mode 100644 ansible/hv-vm-start-one.yml create mode 100644 ansible/hv-vm-stop-all.yml create mode 100644 ansible/mno-add-vm-workers.yml create mode 100644 ansible/roles/copy-pull-secret/tasks/main.yml create mode 100644 ansible/roles/hv-install/defaults/main.yml create mode 100644 ansible/roles/hv-install/templates/hugetlb-reserve-pages.sh.j2 create mode 100644 ansible/roles/hv-vm-destroy/tasks/main.yml create mode 100644 ansible/roles/hv-vm-start/tasks/main.yml create mode 100644 ansible/vm-sriov-disable.yml diff --git a/.gitignore b/.gitignore index 9c186b89..e9d84683 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,5 @@ out gen .idea/ .idea/workspace.xml - +*.log +*.orig diff --git a/ansible.cfg b/ansible.cfg index 60dc3a2e..214988f0 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -1,3 +1,6 @@ [defaults] -interpreter_python=auto +interpreter_python=auto_silent callbacks_enabled = profile_tasks +deprecation_warnings = False +log_path = ~/.ansible/jetlag-ansible.log +display_args_to_stdout = True diff --git a/ansible/.gitignore b/ansible/.gitignore index 2768362a..f2310062 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -1 +1,2 @@ smcipmitool.tar.gz +*.sw* diff --git a/ansible/copy-pull-secret.yml b/ansible/copy-pull-secret.yml new file mode 100644 index 00000000..061a3039 --- /dev/null +++ b/ansible/copy-pull-secret.yml @@ -0,0 +1,15 @@ +--- +# Copy pull secret playbook +# +# This playbook is used to copy the pull secret to the nodes in the cluster. 
+# It is used to updae the pull secret on nodes to pull images from the Red Hat registry. +# +# Example Usage: +# +# ansible-playbook ansible/copy-pull-secret.yml +# + +- name: Copies pull secret to nodes + hosts: hv_vm + roles: + - copy-pull-secret diff --git a/ansible/hv-vm-start-one.yml b/ansible/hv-vm-start-one.yml new file mode 100644 index 00000000..f4f64662 --- /dev/null +++ b/ansible/hv-vm-start-one.yml @@ -0,0 +1,6 @@ +--- +- name: start one VMs + gather_facts: false + hosts: hv + roles: + - hv-vm-start diff --git a/ansible/hv-vm-stop-all.yml b/ansible/hv-vm-stop-all.yml new file mode 100644 index 00000000..ee8db9a4 --- /dev/null +++ b/ansible/hv-vm-stop-all.yml @@ -0,0 +1,5 @@ +--- +- name: destroy all VMs + hosts: hv + roles: + - hv-vm-destroy diff --git a/ansible/mno-add-vm-workers.yml b/ansible/mno-add-vm-workers.yml new file mode 100644 index 00000000..142bd1d3 --- /dev/null +++ b/ansible/mno-add-vm-workers.yml @@ -0,0 +1,121 @@ +--- +# Create and deploy a cluster with the Assisted Installer +# +# Example Usage: +# +# ansible-playbook -i ansible/inventory/cloud42.local ansible/mno-deploy.yml +# + +- name: Prep cluster to add hosts + hosts: bastion + vars_files: + - vars/lab.yml + - vars/all.yml + gather_facts: false + tasks: + - name: Set assisted installer connection + set_fact: + assisted_installer_host: "{{ groups['bastion'][0] }}" + assisted_installer_port: "8090" + + - name: Get cluster status + uri: + url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}" + method: GET + body_format: json + status_code: [200] + return_content: true + register: cluster_data + failed_when: cluster_data.json.status not in ['installed', 'adding-hosts'] + + - name: Set cluster status to adding-hosts + uri: + url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}/actions/allow-add-workers" + method: POST + body_format: json + status_code: [201, 202] + when: cluster_data.json.status == 'installed' + + - name: Get infra-env + uri: + url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/clusters/{{ ai_cluster_id }}" + method: GET + body_format: json + status_code: [200] + return_content: true + register: infra_env_return + + - name: Set ai_infraenv_id + set_fact: + ai_infraenv_id: "{{ infra_env_return.json.hosts[0].infra_env_id }}" + + - name: Get infra-env static_network_config + uri: + url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/infra-envs/{{ ai_infraenv_id }}" + method: GET + body_format: json + status_code: [200] + return_content: true + register: infra_env_return + + - name: Set ai_infraenv_static_config + set_fact: + ai_infraenv_static_config: "{{ infra_env_return.json.static_network_config }}" + + - name: Set empty static network configuration + set_fact: + static_network_config: [] + + - name: Generate Static Network Config for VMs + ansible.builtin.include_role: + name: create-ai-cluster + tasks_from: static_network_config + vars: + hybrid_worker_count: "{{ add_worker_count }}" + loop: "{{ groups['hv_vm'][:hybrid_worker_count | int] }}" + + - name: show ai_infraenv_static_config + debug: + var: ai_infraenv_static_config + + - name: show static_network_config + debug: + var: static_network_config + + - name: Set static network composite + set_fact: + static_network_config_comp: "{{ static_network_config + ai_infraenv_static_config }}" + + - name: show 
static_network_config composite + debug: + var: static_network_config_comp + + - name: Update static config + uri: + url: "http://{{ assisted_installer_host }}:{{ assisted_installer_port }}/api/assisted-install/v2/infra-envs/{{ ai_infraenv_id }}" + body: { + "static_network_config": "{{ static_network_config + ai_infraenv_static_config }}" + } + method: PATCH + body_format: json + status_code: [201] + return_content: true + + +- name: Boot / Install VMs + hosts: bastion + vars_files: + - vars/lab.yml + - vars/all.yml + roles: + - generate-discovery-iso + - role: boot-iso + vars: + inventory_group: hv_vm + index: "{{ add_worker_count }}" + virtual_media_iso: "discovery.iso" + - role: wait-hosts-discovered + vars: + inventory_nodes: "{{ groups['hv_vm'][:add_worker_count|int] }}" + discover_nodes: "{{ groups['hv_vm'][:add_worker_count|int] }}" + - add-hosts-install diff --git a/ansible/roles/copy-pull-secret/tasks/main.yml b/ansible/roles/copy-pull-secret/tasks/main.yml new file mode 100644 index 00000000..078e04fb --- /dev/null +++ b/ansible/roles/copy-pull-secret/tasks/main.yml @@ -0,0 +1,11 @@ +--- +- name: Copy pull secret + copy: + src: pull-secret.json + dest: "/var/lib/kubelet/config.json" + become: true +- name: touch force update + file: + path: /run/machine-config-daemon-force + state: touch + become: true diff --git a/ansible/roles/hv-install/defaults/main.yml b/ansible/roles/hv-install/defaults/main.yml new file mode 100644 index 00000000..bfd2265b --- /dev/null +++ b/ansible/roles/hv-install/defaults/main.yml @@ -0,0 +1,17 @@ +--- +# hv-install default vars + +# Hugepages configuration for hypervisors +enable_hugepages: false + +# Hugepage size: 2M or 1G +hugepage_size: "1G" + +# Number of hugepages to allocate (e.g., 32 for 32GB of 1G hugepages) +hugepage_count: 32 + +# Additional kernel parameters for performance tuning +additional_kernel_params: [] + +# Number of hugepages per node (e.g. 
total / 2) +hugepages_count_per_node: 190 diff --git a/ansible/roles/hv-install/tasks/main.yml b/ansible/roles/hv-install/tasks/main.yml index 4451e0c4..a73d5ee9 100644 --- a/ansible/roles/hv-install/tasks/main.yml +++ b/ansible/roles/hv-install/tasks/main.yml @@ -21,6 +21,55 @@ name: sushy-tools version: 1.2.0 +- name: Configure hugepages support + when: enable_hugepages + block: + + - name: Run grubby to add hugepages arguments + command: grubby --update-kernel=ALL --args="default_hugepagesz={{ hugepage_size }} hugepagesz={{ hugepage_size }}" + register: grub_updated + + - name: Set reboot required flag + set_fact: + hugepages_reboot_required: true + when: grub_updated.changed + + - name: Create hugetlb-gigantic-pages.service file + copy: + dest: /usr/lib/systemd/system/hugetlb-gigantic-pages.service + content: | + [Unit] + Description=HugeTLB Gigantic Pages Reservation + DefaultDependencies=no + Before=dev-hugepages.mount + ConditionPathExists=/sys/devices/system/node + ConditionKernelCommandLine=hugepagesz=1G + + [Service] + Type=oneshot + RemainAfterExit=yes + ExecStart=/usr/lib/systemd/hugetlb-reserve-pages.sh + + [Install] + WantedBy=sysinit.target + + - name: Create hugetlb-reserve-pages.sh + template: + src: hugetlb-reserve-pages.sh.j2 + dest: /usr/lib/systemd/hugetlb-reserve-pages.sh + mode: "0755" + register: hugetlb_script + + - name: Set reboot required flag + set_fact: + hugepages_reboot_required: true + when: hugetlb_script.changed + + - name: Enable hugetlb-gigantic-pages.service + systemd: + enabled: true + name: hugetlb-gigantic-pages.service + - name: Get coredns get_url: validate_certs: false @@ -65,3 +114,21 @@ state: started enabled: true name: ksmtuned + +- name: Reboot hypervisor for hugepages configuration + when: + - enable_hugepages + - hugepages_reboot_required | default(false) + block: + - name: Reboot hypervisor + reboot: + msg: "Rebooting to apply hugepages configuration" + reboot_timeout: 600 + + - name: Verify hugepages are configured + shell: cat /proc/meminfo | grep -E "HugePages_Total|HugePages_Free|Hugepagesize" + register: hugepages_status + + - name: Display hugepages status + debug: + msg: "{{ hugepages_status.stdout_lines }}" diff --git a/ansible/roles/hv-install/templates/hugetlb-reserve-pages.sh.j2 b/ansible/roles/hv-install/templates/hugetlb-reserve-pages.sh.j2 new file mode 100644 index 00000000..5bed529d --- /dev/null +++ b/ansible/roles/hv-install/templates/hugetlb-reserve-pages.sh.j2 @@ -0,0 +1,15 @@ +#!/bin/sh + +nodes_path=/sys/devices/system/node/ +if [ ! 
-d $nodes_path ]; then + echo "ERROR: $nodes_path does not exist" + exit 1 +fi + +reserve_pages() +{ + echo $1 > $nodes_path/$2/hugepages/hugepages-1048576kB/nr_hugepages +} + +reserve_pages {{ hugepages_count_per_node }} node0 +reserve_pages {{ hugepages_count_per_node }} node1 diff --git a/ansible/roles/hv-vm-create/defaults/main.yml b/ansible/roles/hv-vm-create/defaults/main.yml index 7585df82..bd0f1690 100644 --- a/ansible/roles/hv-vm-create/defaults/main.yml +++ b/ansible/roles/hv-vm-create/defaults/main.yml @@ -10,7 +10,7 @@ vnuma_enabled: false vnuma_memory_placement: "static" vnuma_cpu_placement: "static" -# Manual vNUMA configuration +# Manual vNUMA configuration # vnuma_nodes: # - id: 0 # cpus: "0-3" @@ -21,4 +21,16 @@ vnuma_cpu_placement: "static" # vNUMA topology settings vnuma_memory_mode: "strict" # strict, preferred, interleave -vnuma_cpu_mode: "strict" # strict, preferred +vnuma_cpu_mode: "strict" # strict, preferred + +# Hugepages configuration for VMs +enable_vm_hugepages: false + +# Hugepage size for VMs: 2M or 1G +vm_hugepage_size: "1G" + +# Number of hugepages to allocate per VM (auto-calculated based on VM memory if not specified) +vm_hugepage_count: + +# Hugepage mount path in VMs +vm_hugepage_mount: "/mnt/hugepages" diff --git a/ansible/roles/hv-vm-create/tasks/main.yml b/ansible/roles/hv-vm-create/tasks/main.yml index 875407a8..293e84bc 100644 --- a/ansible/roles/hv-vm-create/tasks/main.yml +++ b/ansible/roles/hv-vm-create/tasks/main.yml @@ -20,6 +20,33 @@ set_fact: hv_vm_cpu_count: "{{ hostvars[inventory_hostname]['cpus'] }}" +- name: Configure VM hugepages + when: enable_vm_hugepages + block: + - name: Calculate hugepages needed for VM if not specified + set_fact: + calculated_vm_hugepage_count: "{{ (hostvars[inventory_hostname]['memory'] | int) // (vm_hugepage_size[:-1] | int) }}" + when: vm_hugepage_count is not defined or vm_hugepage_count == "" + + - name: Set hugepage count for VM + set_fact: + vm_hugepages_needed: "{{ vm_hugepage_count if vm_hugepage_count is defined and vm_hugepage_count != '' else calculated_vm_hugepage_count }}" + + - name: Check host hugepages availability + shell: | + grep -E "HugePages_Free.*{{ vm_hugepage_size }}" /proc/meminfo | awk '{print $2}' || echo "0" + register: host_hugepages_free + delegate_to: "{{ hostvars[inventory_hostname]['ansible_host'] }}" + + - name: Validate sufficient hugepages available + fail: + msg: "Not enough {{ vm_hugepage_size }} hugepages available on host {{ hostvars[inventory_hostname]['ansible_host'] }}. 
Need: {{ vm_hugepages_needed }}, Available: {{ host_hugepages_free.stdout }}" + when: (host_hugepages_free.stdout | int) < (vm_hugepages_needed | int) + + - name: Display hugepages configuration for VM + debug: + msg: "VM {{ inventory_hostname }} will use {{ vm_hugepages_needed }} {{ vm_hugepage_size }} hugepages ({{ (vm_hugepages_needed | int) * (vm_hugepage_size[:-1] | int) }}G total)" + - name: Set vNUMA configuration tasks when: vnuma_enabled block: diff --git a/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 b/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 index df33d85a..6b671470 100644 --- a/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 +++ b/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 @@ -3,6 +3,13 @@ {{ hostvars[inventory_hostname]['domain_uuid'] }} {{ hostvars[inventory_hostname]['memory'] }} {{ hostvars[inventory_hostname]['memory'] }} +{% if enable_vm_hugepages %} + + + + + +{% endif %} {{ hv_vm_cpu_count | int }} hvm @@ -11,6 +18,7 @@ + {% if vnuma_enabled %} @@ -125,6 +133,18 @@ {% endif %}
+{% for i in range(1, 6) %} + +{% set mac_prefix = "%s:%02x" | format('52:54:00',i) %} + + + +
+ +{% endfor %} + + + diff --git a/ansible/roles/hv-vm-destroy/tasks/main.yml b/ansible/roles/hv-vm-destroy/tasks/main.yml new file mode 100644 index 00000000..28049f8f --- /dev/null +++ b/ansible/roles/hv-vm-destroy/tasks/main.yml @@ -0,0 +1,5 @@ +--- +- name: Stop all vms + shell: + for i in $(virsh list --all --name | grep vm) ; do virsh destroy $i ; done + become: true diff --git a/ansible/roles/hv-vm-start/tasks/main.yml b/ansible/roles/hv-vm-start/tasks/main.yml new file mode 100644 index 00000000..6f9d711e --- /dev/null +++ b/ansible/roles/hv-vm-start/tasks/main.yml @@ -0,0 +1,5 @@ +--- +- name: Start one vm + shell: + for i in $(virsh list --all --name --state-shutoff | grep vm |head -1) ; do virsh start $i ; done + become: true diff --git a/ansible/roles/ocp-scale-out-csr/tasks/check_nodes_joined.yml b/ansible/roles/ocp-scale-out-csr/tasks/check_nodes_joined.yml index f2be9be4..853ff866 100644 --- a/ansible/roles/ocp-scale-out-csr/tasks/check_nodes_joined.yml +++ b/ansible/roles/ocp-scale-out-csr/tasks/check_nodes_joined.yml @@ -1,15 +1,5 @@ --- -- name: Set Facts to recurse with - set_fact: - r_qry: "{{ qry }}" - r_worker_counter: "{{ worker_counter }}" - -- name: Set KUBECONFIG path based on cluster type - set_fact: - cluster_kubeconfig: "{{ bastion_cluster_config_dir }}/{{ 'kubeconfig' if cluster_type != 'sno' else groups['sno'][0] + '/kubeconfig' }}" - -- name: approve CSRs and check if nodes have joined the cluster - block: +- block: - name: Increment the retry count set_fact: retry: "{{ 0 if retry is undefined else retry | int + 1 }}" @@ -19,35 +9,21 @@ seconds: "30" when: retry|int > 0 - - name: Get CSRs + - name: Get Pending CSRs shell: | - KUBECONFIG={{ cluster_kubeconfig }} oc get csr -o json + KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc get csr --no-headers | grep Pending | awk '{ print $1 }' register: oc_get_csr - name: Approve pending CSRs shell: | - KUBECONFIG={{ cluster_kubeconfig }} oc adm certificate approve {{ item.metadata.name }} - loop: "{{ oc_get_csr.stdout | from_json | json_query(qry) }}" - loop_control: - label: "{{ item.metadata.name }}" - - - name: Get worker node count - shell: | - KUBECONFIG={{ cluster_kubeconfig }} oc get nodes | {{ worker_counter }} - register: oc_get_nodes_workers - - - name: Current Worker Node Count - debug: - var: oc_get_nodes_workers.stdout - - - name: Waiting for Worker Node Count - debug: - msg: "{{ current_worker_count+scale_out_count }}" + KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc adm certificate approve {{ item }} + loop: "{{ oc_get_csr.stdout_lines }}" + when: oc_get_csr.stdout_lines | length > 0 - - name: Raise fail to trigger retry if all nodes didn't meet requirments + - name: Raise fail to trigger retry if CSRs still Pending fail: - msg: All nodes have not met check requirements - when: oc_get_nodes_workers.stdout|int < current_worker_count+scale_out_count + msg: CSRs still pending. 
Try again + when: oc_get_csr.stdout_lines |length > 0 rescue: - name: Fail on maximum retry count fail: @@ -56,6 +32,3 @@ - name: Retry the check include_tasks: check_nodes_joined.yml - vars: - qry: "{{ r_qry }}" - worker_counter: "{{ r_worker_counter }}" diff --git a/ansible/roles/ocp-scale-out-csr/tasks/main.yml b/ansible/roles/ocp-scale-out-csr/tasks/main.yml index 13e64a94..e5bc2f2f 100644 --- a/ansible/roles/ocp-scale-out-csr/tasks/main.yml +++ b/ansible/roles/ocp-scale-out-csr/tasks/main.yml @@ -1,12 +1,11 @@ --- -- name: Approve node-bootstrapper CSRs and wait for nodes to join cluster +- name: Approve CSRs include_tasks: check_nodes_joined.yml - vars: - qry: "items[?status.conditions==null && spec.username == 'system:serviceaccount:openshift-machine-config-operator:node-bootstrapper']" - worker_counter: "grep worker | grep -v -c master" -- name: Approve Kublet-serving CSRs and wait for nodes to join cluster - include_tasks: check_nodes_joined.yml - vars: - qry: "items[?status.conditions==null && spec.signerName == 'kubernetes.io/kubelet-serving']" - worker_counter: "grep worker | grep -v master | grep -c -v NotReady" +- name: Wait for expected number of workers to be Ready + shell: | + KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc get nodes --no-headers -l node-role.kubernetes.io/worker | grep -c -v NotReady + register: oc_get_nodes_workers + until: oc_get_nodes_workers.stdout|int < current_worker_count+scale_out_count + retries: 540 + delay: 30 diff --git a/ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml b/ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml index 807b3b11..fe951ca3 100644 --- a/ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml +++ b/ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml @@ -54,3 +54,4 @@ "host_name": "{{ hostname }}", "host_role": "{{ host_role }}" } + ignore_errors: yes diff --git a/ansible/vars/hv.sample.yml b/ansible/vars/hv.sample.yml index b50b2551..70217973 100644 --- a/ansible/vars/hv.sample.yml +++ b/ansible/vars/hv.sample.yml @@ -48,3 +48,35 @@ hv_vm_manifest_acm_cr: true use_bastion_registry: false # Provide pull-secret for connected manifests pull_secret: "{{ lookup('file', '../pull-secret.txt') | b64encode }}" + +################################################################################ +# Hugepages Configuration +################################################################################ + +# Enable hugepages on hypervisors +enable_hugepages: false + +# Hugepage size for hypervisors: 2M or 1G +hugepage_size: "1G" + +# Number of hugepages to allocate on hypervisors (e.g., 64 for 64GB of 1G hugepages) +# Calculate based on total memory and VM requirements +hugepage_count: 64 + +# Additional kernel parameters for performance tuning +additional_kernel_params: + - "intel_iommu=on" + - "iommu=pt" + - "isolcpus=2-15,18-31" + +# Enable hugepages for VMs +enable_vm_hugepages: false + +# Hugepage size for VMs (should match hypervisor hugepage_size) +vm_hugepage_size: "1G" + +# Number of hugepages per VM (auto-calculated based on VM memory if not specified) +# vm_hugepage_count: 18 + +# Enable vNUMA for performance (recommended with hugepages) +vnuma_enabled: false diff --git a/ansible/vars/lab.yml b/ansible/vars/lab.yml index 7f7ca2d4..81d54250 100644 --- a/ansible/vars/lab.yml +++ b/ansible/vars/lab.yml @@ -218,7 +218,7 @@ hw_vm_counts: nvme0n1: 12 r650: default: 4 - nvme0n1: 23 + nvme0n1: 16 r660: default: 4 nvme0n1: 23 diff --git a/ansible/vars/scale_out.sample.yml 
b/ansible/vars/scale_out.sample.yml index 30133fef..4b0b7990 100644 --- a/ansible/vars/scale_out.sample.yml +++ b/ansible/vars/scale_out.sample.yml @@ -3,9 +3,9 @@ # This assumes they are all listed in the worker inventory # group. This varable is an offset used to skip worker node # records in the worker inventory group. -current_worker_count: 120 +current_worker_count: 3 # Set this to the number of worker nodes being added to the # cluster. At minimum, current_worker_count + scale_out_count # inventory records must exist in the inventory file. -scale_out_count: 100 +scale_out_count: 3 diff --git a/ansible/vm-sriov-disable.yml b/ansible/vm-sriov-disable.yml new file mode 100644 index 00000000..9fdba3c2 --- /dev/null +++ b/ansible/vm-sriov-disable.yml @@ -0,0 +1,25 @@ +--- +# Disables igb VFs from attempting to connect, which never succeeds and thus drives up CPU across all the workers. +# +# Exepects an inventory that has only the [worker] block, as with the normal inventory created where workers show up under [worker] and [hv_vm] seem to have some variables that affect how the node is accessed. +# +# Example Usage: +# +# ansible-playbook -i ansible/inventory/cloud42.local ansible/vm-sriov-disable.yml +# + +- name: Disable all fake sr-iov devices and connections + gather_facts: false + hosts: worker + tasks: + - name: devices down + shell: + for i in {5..9} ; do for j in {0..6} ; do nmcli d down enp${i}s0v${j} ; done ; done + become: true + ignore_errors: true + + - name: connections autoconnect off + shell: + for i in $( nmcli conn show | grep "Wired connection" | awk '{ print $4 }' ) ; do nmcli conn mod $i connection.autoconnect no ; done + become: true + ignore_errors: true diff --git a/docs/deploy-vmno.md b/docs/deploy-vmno.md index 109a0a1f..6d3bb724 100644 --- a/docs/deploy-vmno.md +++ b/docs/deploy-vmno.md @@ -123,9 +123,21 @@ hw_vm_counts: nvme0n1: 7 ``` +When mixing different machines, the hv_vm_counts may be adjusted for those machine models to create the same number of VMs per hypervisor. For example, when mixing Dell r640 and r650 in ScaleLab, the following counts were used: + +```yaml +hw_vm_counts: + scalelab: + r650: + default: 4 + nvme0n1: 16 +``` + > [!NOTE] > Depending upon your hardware, you may have to parition and format a 2nd disk to help store VM disk files. +In some VM scenarios, hugepages may be required. To configure VMs with hugepages, enable with the variable `enable_hugepages`, and configure specifics with other similar variables found in: `ansible/roles/hv-install/defaults/main.yml`. + ## Configure Ansible vars in `hv.yml` ```console @@ -484,3 +496,22 @@ vm00008 Ready worker 1d v1.31.7 (.ansible) [root@ jetlag]# cat /root/vmno/kubeadmin-password xxxxx-xxxxx-xxxxx-xxxxx ``` + +## Additional helper playbooks for VM management + +If VMs become unresponsive, sometimes destroying and restarting them is the only remedy. Since the garbage cleanup of pods of all VMs on a single hypervisor at a time can cause stalling, it also may be beneficial to start one VM per HV at a time. Playbooks have been added for all of these tasks. + +See the following playbooks to help in these cases: +``` +ansible/hv-vm-stop-all.yml +ansible/hv-vm-start-one.yml +``` + +## Disabling NetworkManager devices and connections for SR-IOV devices on VMs + +When VMs are created with SR-IOV devices using the IGB driver, the devices and connections will never fully initialize. NetworkManager repeatedly attempts to start them, which results in a large amount of churn on the VMs. 
A workaround to this churn is to force the devices down and connections' autoconnect off for those created for the interfaces. + +See the following playbook: +``` +ansible/vm-sriov-disable.yml +``` \ No newline at end of file diff --git a/docs/hypervisors.md b/docs/hypervisors.md index d050123b..1d605a49 100644 --- a/docs/hypervisors.md +++ b/docs/hypervisors.md @@ -112,16 +112,18 @@ Check if the servers in your allocation support NUMA config: ``` Example output indicating NUMA support: +```console NUMA node(s): 2 NUMA node0 CPU(s): 0-11,24-35 NUMA node1 CPU(s): 12-23,36-47 +``` -Add this var to your ansible/vars/all.yml file to enable vnuma config for virtual deployments: +Add this var to your `ansible/vars/all.yml` file to enable vnuma config for virtual deployments: ```yaml vnuma_enabled: true ``` -Refer to ansible/roles/hv-vm-create/defaults/main.yml for other vNUMA configuration options. +Refer to `ansible/roles/hv-vm-create/defaults/main.yml` for other vNUMA configuration options. ## Create/Delete/Replace VMs diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 2d47c9a1..ef6a3d9a 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -12,6 +12,7 @@ _**Table of Contents**_ - [Failed on Wait for cluster to be ready](#failed-on-wait-for-cluster-to-be-ready) - [Failed on Adjust by-path selected install disk](#failed-on-adjust-by-path-selected-install-disk) - [Failed on Insert Virtual Media](#failed-on-insert-virtual-media) + - [Failing ImagePull due to Pull Secret](#failing-imagepull-due-to-pull-secret) - [Bastion](#bastion) - [Accessing services](#accessing-services) - [Clean all container services / podman pods](#clean-all-container-services--podman-pods) @@ -274,6 +275,15 @@ racadm>>set iDRAC.VirtualMedia.Attached Attached Object value modified successfully ``` +## Failing ImagePull due to Pull Secret + +If a cluster has been running for some time or has changed hands between owners, there is a chance the pull secret supplied at install time may have expired. If the cluster is degraded enough as a result, the control plane will not be able to update the kubelet's pull secret automatically. + +For this emergency scenario, a playbook has been created that should hopefully help: +``` +ansible-playbook ansible/copy-pull-secret.yml +``` + # Bastion ## Accessing services From e5a7d64eb03d8cf830cda6bd32933f0e282e8ead Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Thu, 20 Nov 2025 10:14:59 -0800 Subject: [PATCH 2/6] Feedback from PR Renamed variable to remove "enabled" Added new variable to switch ON VFs in VMs. Defaults to OFF. 
--- ansible/roles/hv-vm-create/defaults/main.yml | 5 ++++- ansible/roles/hv-vm-create/tasks/main.yml | 2 +- ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 | 6 +++++- ansible/roles/hv-vm-start/tasks/main.yml | 2 +- ansible/vars/hv.sample.yml | 5 ++++- docs/troubleshooting.md | 9 ++++++--- 6 files changed, 21 insertions(+), 8 deletions(-) diff --git a/ansible/roles/hv-vm-create/defaults/main.yml b/ansible/roles/hv-vm-create/defaults/main.yml index bd0f1690..527ba186 100644 --- a/ansible/roles/hv-vm-create/defaults/main.yml +++ b/ansible/roles/hv-vm-create/defaults/main.yml @@ -24,7 +24,7 @@ vnuma_memory_mode: "strict" # strict, preferred, interleave vnuma_cpu_mode: "strict" # strict, preferred # Hugepages configuration for VMs -enable_vm_hugepages: false +vm_hugepages: false # Hugepage size for VMs: 2M or 1G vm_hugepage_size: "1G" @@ -34,3 +34,6 @@ vm_hugepage_count: # Hugepage mount path in VMs vm_hugepage_mount: "/mnt/hugepages" + +# Enable IGB NICs for VMs +vm_igb_nics: false \ No newline at end of file diff --git a/ansible/roles/hv-vm-create/tasks/main.yml b/ansible/roles/hv-vm-create/tasks/main.yml index 293e84bc..f78add51 100644 --- a/ansible/roles/hv-vm-create/tasks/main.yml +++ b/ansible/roles/hv-vm-create/tasks/main.yml @@ -21,7 +21,7 @@ hv_vm_cpu_count: "{{ hostvars[inventory_hostname]['cpus'] }}" - name: Configure VM hugepages - when: enable_vm_hugepages + when: vm_hugepages block: - name: Calculate hugepages needed for VM if not specified set_fact: diff --git a/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 b/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 index 6b671470..197e2fca 100644 --- a/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 +++ b/ansible/roles/hv-vm-create/templates/kvm-def.xml.j2 @@ -3,7 +3,7 @@ {{ hostvars[inventory_hostname]['domain_uuid'] }} {{ hostvars[inventory_hostname]['memory'] }} {{ hostvars[inventory_hostname]['memory'] }} -{% if enable_vm_hugepages %} +{% if vm_hugepages %} @@ -18,7 +18,9 @@ +{% if vm_igb_nics | default(false) %} +{% endif %} {% if vnuma_enabled %} @@ -133,6 +135,7 @@ {% endif %}
+{% if vm_igb_nics | default(false) %} {% for i in range(1, 6) %} {% set mac_prefix = "%s:%02x" | format('52:54:00',i) %} @@ -145,6 +148,7 @@ +{% endif %} diff --git a/ansible/roles/hv-vm-start/tasks/main.yml b/ansible/roles/hv-vm-start/tasks/main.yml index 6f9d711e..e278efff 100644 --- a/ansible/roles/hv-vm-start/tasks/main.yml +++ b/ansible/roles/hv-vm-start/tasks/main.yml @@ -1,5 +1,5 @@ --- - name: Start one vm shell: - for i in $(virsh list --all --name --state-shutoff | grep vm |head -1) ; do virsh start $i ; done + for i in $(virsh list --all --name --state-shutoff | grep vm |head -1) ; do virsh start $i ; done become: true diff --git a/ansible/vars/hv.sample.yml b/ansible/vars/hv.sample.yml index 70217973..288e4959 100644 --- a/ansible/vars/hv.sample.yml +++ b/ansible/vars/hv.sample.yml @@ -70,7 +70,7 @@ additional_kernel_params: - "isolcpus=2-15,18-31" # Enable hugepages for VMs -enable_vm_hugepages: false +vm_hugepages: false # Hugepage size for VMs (should match hypervisor hugepage_size) vm_hugepage_size: "1G" @@ -80,3 +80,6 @@ vm_hugepage_size: "1G" # Enable vNUMA for performance (recommended with hugepages) vnuma_enabled: false + +# Enable IGB NICs for VMs +vm_igb_nics: false \ No newline at end of file diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index ef6a3d9a..c3bc5ba4 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -12,7 +12,7 @@ _**Table of Contents**_ - [Failed on Wait for cluster to be ready](#failed-on-wait-for-cluster-to-be-ready) - [Failed on Adjust by-path selected install disk](#failed-on-adjust-by-path-selected-install-disk) - [Failed on Insert Virtual Media](#failed-on-insert-virtual-media) - - [Failing ImagePull due to Pull Secret](#failing-imagepull-due-to-pull-secret) + - [Failing ImagePull due to Pull Secret](#failing-imagepull-due-to-deactivated-pull-secret) - [Bastion](#bastion) - [Accessing services](#accessing-services) - [Clean all container services / podman pods](#clean-all-container-services--podman-pods) @@ -275,9 +275,12 @@ racadm>>set iDRAC.VirtualMedia.Attached Attached Object value modified successfully ``` -## Failing ImagePull due to Pull Secret +## Failing ImagePull due to Deactivated Pull Secret -If a cluster has been running for some time or has changed hands between owners, there is a chance the pull secret supplied at install time may have expired. If the cluster is degraded enough as a result, the control plane will not be able to update the kubelet's pull secret automatically. +If a cluster has been running for some time or has changed hands between owners, there is a chance the pull secret supplied at install time has expired. +Any attempt to update the pull secret by standard means (i.e. `oc edit -n openshift-config secret/pull-secret`) will not work. + +If the cluster is degraded enough as a result, the control plane will not be able to update the kubelet's pull secret automatically. 
For this emergency scenario, a playbook has been created that should hopefully help: ``` From a0d87ff88f42fa2a42ab47e3fa1011507ab72e45 Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Thu, 20 Nov 2025 10:16:28 -0800 Subject: [PATCH 3/6] Revert ocp-scale-out-csr changes Signed-off-by: Andrew Collins --- ansible/roles/ocp-scale-out-csr/tasks/main.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/ansible/roles/ocp-scale-out-csr/tasks/main.yml b/ansible/roles/ocp-scale-out-csr/tasks/main.yml index e5bc2f2f..13e64a94 100644 --- a/ansible/roles/ocp-scale-out-csr/tasks/main.yml +++ b/ansible/roles/ocp-scale-out-csr/tasks/main.yml @@ -1,11 +1,12 @@ --- -- name: Approve CSRs +- name: Approve node-bootstrapper CSRs and wait for nodes to join cluster include_tasks: check_nodes_joined.yml + vars: + qry: "items[?status.conditions==null && spec.username == 'system:serviceaccount:openshift-machine-config-operator:node-bootstrapper']" + worker_counter: "grep worker | grep -v -c master" -- name: Wait for expected number of workers to be Ready - shell: | - KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc get nodes --no-headers -l node-role.kubernetes.io/worker | grep -c -v NotReady - register: oc_get_nodes_workers - until: oc_get_nodes_workers.stdout|int < current_worker_count+scale_out_count - retries: 540 - delay: 30 +- name: Approve Kublet-serving CSRs and wait for nodes to join cluster + include_tasks: check_nodes_joined.yml + vars: + qry: "items[?status.conditions==null && spec.signerName == 'kubernetes.io/kubelet-serving']" + worker_counter: "grep worker | grep -v master | grep -c -v NotReady" From 4a22cd2bababc5c00f14ea4f87d62dff4b48f95a Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Thu, 20 Nov 2025 10:17:59 -0800 Subject: [PATCH 4/6] Revert set_hostname_role Signed-off-by: Andrew Collins --- ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml b/ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml index fe951ca3..807b3b11 100644 --- a/ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml +++ b/ansible/roles/wait-hosts-discovered/tasks/set_hostname_role.yml @@ -54,4 +54,3 @@ "host_name": "{{ hostname }}", "host_role": "{{ host_role }}" } - ignore_errors: yes From e3db4bd248e9ec70dbd4f1db4e79d9e32b122d04 Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Thu, 20 Nov 2025 10:20:18 -0800 Subject: [PATCH 5/6] Revert ansible.cfg Signed-off-by: Andrew Collins --- ansible.cfg | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ansible.cfg b/ansible.cfg index 214988f0..60dc3a2e 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -1,6 +1,3 @@ [defaults] -interpreter_python=auto_silent +interpreter_python=auto callbacks_enabled = profile_tasks -deprecation_warnings = False -log_path = ~/.ansible/jetlag-ansible.log -display_args_to_stdout = True From ffa0dce085b2f77c51953a972824e8cbf41a5954 Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Thu, 20 Nov 2025 10:22:41 -0800 Subject: [PATCH 6/6] Added note about VM NICs into the docs --- docs/deploy-vmno.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/deploy-vmno.md b/docs/deploy-vmno.md index 6d3bb724..2bae6e56 100644 --- a/docs/deploy-vmno.md +++ b/docs/deploy-vmno.md @@ -509,7 +509,10 @@ ansible/hv-vm-start-one.yml ## Disabling NetworkManager devices and connections for SR-IOV devices on VMs -When VMs are created with 
SR-IOV devices using the IGB driver, the devices and connections will never fully initialize. NetworkManager repeatedly attempts to start them, which results in a large amount of churn on the VMs. A workaround to this churn is to force the devices down and connections' autoconnect off for those created for the interfaces. +One option of creating SR-IOV capable interfaces in a VM is to create them using the Intel IGB driver. +This may be achieved by setting the variable `vm_igb_nics: true` in your variables. + +**Please note:** When VMs are created with SR-IOV devices using the IGB driver, the devices and connections will never fully initialize. NetworkManager repeatedly attempts to start them, which results in a large amount of churn on the VMs. A workaround to this churn is to force the devices down and connections' autoconnect off for those created for the interfaces. See the following playbook: ```