From 0a39a904179e3776ae75dc15ad460884be4dc733 Mon Sep 17 00:00:00 2001
From: upodroid
Date: Sun, 5 Oct 2025 23:05:55 +0300
Subject: [PATCH] add single process oom kill kubeletconfig to GKE nodes

k8s-infra-prow-build:
- Replace the pool6 / pool7 / pool7-arm64 node pools (in-repo
  ../modules/gke-nodepool) with pool8-intel / pool8-amd / pool8-arm built
  from the terraform-google-modules gke-node-pool submodule (~> 40.0).
  Each new pool sets kubelet_config.single_process_oom_kill and enables
  shielded secure boot.
- Create the project through terraform-google-modules/project-factory
  (~> 18.0) instead of ../modules/gke-project.
- Drop the standalone google_project_iam_member resources and the
  google_organization / google_iam_role data sources; the oncall
  (roles/owner) and viewers (prow.viewer, roles/viewer) grants become
  bindings in iam.tf.
- Allow the external-secrets workload identity of
  k8s-infra-prow-build-trusted to access secrets in this project.
- Rename 00-provider.tf to provider.tf and point the GCS backend at
  bucket "k8s-infra-terraform", prefix "k8s-infra-prow-build".

All modules: bump the google and google-beta providers from ~> 6.31.0 to
~> 6.50.0.
---
Review notes (this section is ignored by `git am`):
- NOTE(review): all three pools set `single_process_oom_kill = false`.
  Confirm against the kubelet configuration reference and the linked prow
  issue that `false` (rather than `true`) is the value this change intends.
- NOTE(review): the backend bucket and prefix change; presumably the
  existing state has to be migrated before the next apply - confirm.
- NOTE(review): the google_project_iam_member granting
  roles/container.admin to prow-deployer is removed. prow-deployer is
  visible in an existing iam.tf binding whose role is outside the diff
  context - confirm that binding is roles/container.admin.
- Whitespace inside the hunks was reconstructed; if context does not match
  byte-for-byte, apply with `git apply --ignore-whitespace`.

 .../gcp/terraform/k8s-infra-prow-build/iam.tf |  10 +
 .../terraform/k8s-infra-prow-build/main.tf    | 198 +++++++++---------
 .../{00-provider.tf => provider.tf}           |   8 +-
 .../terraform/modules/gke-cluster/versions.tf |   4 +-
 .../modules/gke-nodepool/versions.tf          |   4 +-
 .../terraform/modules/gke-project/versions.tf |   4 +-
 .../versions.tf                               |   4 +-
 7 files changed, 125 insertions(+), 107 deletions(-)
 rename infra/gcp/terraform/k8s-infra-prow-build/{00-provider.tf => provider.tf} (82%)

diff --git a/infra/gcp/terraform/k8s-infra-prow-build/iam.tf b/infra/gcp/terraform/k8s-infra-prow-build/iam.tf
index 748a6d4efda..0f4cbd691c8 100644
--- a/infra/gcp/terraform/k8s-infra-prow-build/iam.tf
+++ b/infra/gcp/terraform/k8s-infra-prow-build/iam.tf
@@ -29,9 +29,19 @@ module "iam" {
       "serviceAccount:prow-control-plane@k8s-infra-prow.iam.gserviceaccount.com",
       "serviceAccount:prow-deployer@k8s-infra-prow-build-trusted.iam.gserviceaccount.com"
     ]
+    "roles/owner" = [
+      "group:k8s-infra-prow-oncall@kubernetes.io"
+    ]
+    "organizations/758905017065/roles/prow.viewer" = [
+      "group:k8s-infra-prow-viewers@kubernetes.io"
+    ]
+    "roles/viewer" = [
+      "group:k8s-infra-prow-viewers@kubernetes.io"
+    ]
     "roles/secretmanager.secretAccessor" = [
       "serviceAccount:kubernetes-external-secrets@k8s-infra-prow-build.iam.gserviceaccount.com",
       "principal://iam.googleapis.com/projects/${module.project.project_number}/locations/global/workloadIdentityPools/${module.project.project_id}.svc.id.goog/subject/ns/external-secrets/sa/external-secrets",
+      "principal://iam.googleapis.com/projects/180382678033/locations/global/workloadIdentityPools/k8s-infra-prow-build-trusted.svc.id.goog/subject/ns/external-secrets/sa/external-secrets",
     ]
   }
 }
diff --git a/infra/gcp/terraform/k8s-infra-prow-build/main.tf b/infra/gcp/terraform/k8s-infra-prow-build/main.tf
index ff00ac41d00..306d3e7b21c 100644
--- a/infra/gcp/terraform/k8s-infra-prow-build/main.tf
+++ b/infra/gcp/terraform/k8s-infra-prow-build/main.tf
@@ -31,41 +31,31 @@ locals {
   pod_namespace = "test-pods" // MUST match whatever prow is configured to use when it schedules to this cluster
 }
 
-data "google_organization" "org" {
-  domain = "kubernetes.io"
-}
-
 module "project" {
-  source       = "../modules/gke-project"
-  project_id   = local.project_id
-  project_name = local.project_id
-}
-
-// Ensure k8s-infra-prow-oncall@kuberentes.io has owner access to this project
-resource "google_project_iam_member" "k8s_infra_prow_oncall" {
-  project = module.project.project_id
-  role    = "roles/owner"
-  member  = "group:k8s-infra-prow-oncall@kubernetes.io"
-}
-
-// Role created by ensure-organization.sh, use a data source to ensure it exists
-data "google_iam_role" "prow_viewer" {
-  name = "${data.google_organization.org.name}/roles/prow.viewer"
-}
-
-// Ensure k8s-infra-prow-viewers@kuberentes.io has prow.viewer access to this project
-resource "google_project_iam_member" "k8s_infra_prow_viewers" {
-  project = module.project.project_id
-  role    = data.google_iam_role.prow_viewer.name
-  member  = "group:k8s-infra-prow-viewers@kubernetes.io"
-}
-
-// Allow prow-deployer service account in k8s-infra-prow-build-trusted to deploy
-// to the cluster defined in here
-resource "google_project_iam_member" "prow_deployer_for_prow_build" {
-  project = module.project.project_id
-  role    = "roles/container.admin"
-  member  = "serviceAccount:prow-deployer@k8s-infra-prow-build-trusted.iam.gserviceaccount.com"
+  source  = "terraform-google-modules/project-factory/google"
+  version = "~> 18.0"
+
+  name            = "k8s-infra-prow-build"
+  project_id      = "k8s-infra-prow-build"
+  folder_id       = "411137699919"
+  billing_account = "018801-93540E-22A20E"
+
+  # Sane project defaults
+  default_service_account     = "keep"
+  disable_services_on_destroy = false
+  create_project_sa           = false
+  random_project_id           = false
+  auto_create_network         = true
+  activate_apis = [
+    "secretmanager.googleapis.com",
+    "cloudasset.googleapis.com",
+    "compute.googleapis.com",
+    "container.googleapis.com",
+    "cloudkms.googleapis.com",
+    "artifactregistry.googleapis.com",
+    "cloudbuild.googleapis.com",
+    "bigquery.googleapis.com"
+  ]
 }
 
 module "prow_build_cluster" {
@@ -80,47 +70,6 @@ module "prow_build_cluster" {
   cloud_shell_access = false
 }
 
-module "prow_build_nodepool_c4_highmem_8_localssd" {
-  source       = "../modules/gke-nodepool"
-  project_name = module.project.project_id
-  cluster_name = module.prow_build_cluster.cluster.name
-  location     = module.prow_build_cluster.cluster.location
-  node_locations = [
-    "us-central1-b",
-    "us-central1-c",
-    "us-central1-f",
-  ]
-  name            = "pool6"
-  initial_count   = 1
-  min_count       = 1
-  max_count       = 80
-  machine_type    = "c4-highmem-8"
-  disk_size_gb    = 500
-  disk_type       = "hyperdisk-balanced"
-  service_account = module.prow_build_cluster.cluster_node_sa.email
-}
-
-module "prow_build_nodepool_c4d_highmem_8_localssd" {
-  source       = "../modules/gke-nodepool"
-  project_name = module.project.project_id
-  cluster_name = module.prow_build_cluster.cluster.name
-  location     = module.prow_build_cluster.cluster.location
-  node_locations = [
-    "us-central1-a",
-    "us-central1-b",
-    "us-central1-c",
-  ]
-  name            = "pool7"
-  initial_count   = 1
-  min_count       = 10
-  max_count       = 80
-  machine_type    = "c4d-highmem-8-lssd" # has 2 local ssd disks attached
-  disk_size_gb    = 100
-  disk_type       = "hyperdisk-balanced"
-  service_account = module.prow_build_cluster.cluster_node_sa.email
-}
-
-
 module "sig_node_node_pool_1_n4_highmem_8" {
   source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gke-nodepool?ref=v39.0.0&depth=1"
 
@@ -160,24 +109,83 @@ module "sig_node_node_pool_1_n4_highmem_8" {
   taints = { dedicated = { value = "sig-node", effect = "NO_SCHEDULE" } }
 }
 
-module "prow_build_nodepool_c4a_highmem_8_localssd" {
-  source       = "../modules/gke-nodepool"
-  project_name = module.project.project_id
-  cluster_name = module.prow_build_cluster.cluster.name
-  location     = module.prow_build_cluster.cluster.location
-  node_locations = [
-    "us-central1-a",
-    "us-central1-b",
-    "us-central1-c",
-  ]
-  name          = "pool7-arm64"
-  initial_count = 1
-  min_count     = 1
-  max_count     = 10
-  machine_type  = "c4a-highmem-8-lssd" # has 2 local ssd disks attached
-  disk_size_gb  = 100
-  disk_type     = "hyperdisk-balanced"
-  // GKE automatically taints arm64 nodes
-  // https://cloud.google.com/kubernetes-engine/docs/how-to/prepare-arm-workloads-for-deployment#overview
-  service_account = module.prow_build_cluster.cluster_node_sa.email
+module "prod_intel_pool" {
+  source         = "terraform-google-modules/kubernetes-engine/google//modules/gke-node-pool"
+  version        = "~> 40.0"
+  project_id     = module.project.project_id
+  name           = "pool8-intel"
+  cluster        = module.prow_build_cluster.cluster.name
+  node_locations = ["us-central1-b", "us-central1-c", "us-central1-f"]
+
+  autoscaling = {
+    max_node_count = 100
+    min_node_count = 1
+  }
+
+  node_config = {
+    service_account = module.prow_build_cluster.cluster_node_sa.email
+    machine_type    = "c4-highmem-8-lssd"
+    disk_type       = "hyperdisk-balanced"
+    image_type      = "COS_CONTAINERD"
+    kubelet_config = {
+      single_process_oom_kill = false # https://github.com/kubernetes-sigs/prow/issues/210
+    }
+    shielded_instance_config = {
+      enable_secure_boot = true
+    }
+  }
+}
+
+module "prod_amd_pool" {
+  source         = "terraform-google-modules/kubernetes-engine/google//modules/gke-node-pool"
+  version        = "~> 40.0"
+  project_id     = module.project.project_id
+  name           = "pool8-amd"
+  cluster        = module.prow_build_cluster.cluster.name
+  node_locations = ["us-central1-b", "us-central1-c", "us-central1-f"]
+
+  autoscaling = {
+    max_node_count = 100
+    min_node_count = 1
+  }
+
+  node_config = {
+    service_account = module.prow_build_cluster.cluster_node_sa.email
+    machine_type    = "c4d-highmem-8-lssd"
+    disk_type       = "hyperdisk-balanced"
+    image_type      = "COS_CONTAINERD"
+    kubelet_config = {
+      single_process_oom_kill = false # https://github.com/kubernetes-sigs/prow/issues/210
+    }
+    shielded_instance_config = {
+      enable_secure_boot = true
+    }
+  }
+}
+
+module "prod_arm_pool" {
+  source         = "terraform-google-modules/kubernetes-engine/google//modules/gke-node-pool"
+  version        = "~> 40.0"
+  project_id     = module.project.project_id
+  name           = "pool8-arm"
+  cluster        = module.prow_build_cluster.cluster.name
+  node_locations = ["us-central1-b", "us-central1-c", "us-central1-f"]
+
+  autoscaling = {
+    max_node_count = 100
+    min_node_count = 1
+  }
+
+  node_config = {
+    service_account = module.prow_build_cluster.cluster_node_sa.email
+    machine_type    = "c4a-highmem-8-lssd"
+    disk_type       = "hyperdisk-balanced"
+    image_type      = "COS_CONTAINERD"
+    kubelet_config = {
+      single_process_oom_kill = false # https://github.com/kubernetes-sigs/prow/issues/210
+    }
+    shielded_instance_config = {
+      enable_secure_boot = true
+    }
+  }
 }
diff --git a/infra/gcp/terraform/k8s-infra-prow-build/00-provider.tf b/infra/gcp/terraform/k8s-infra-prow-build/provider.tf
similarity index 82%
rename from infra/gcp/terraform/k8s-infra-prow-build/00-provider.tf
rename to infra/gcp/terraform/k8s-infra-prow-build/provider.tf
index cac23ac9146..9e69989eaa3 100644
--- a/infra/gcp/terraform/k8s-infra-prow-build/00-provider.tf
+++ b/infra/gcp/terraform/k8s-infra-prow-build/provider.tf
@@ -23,18 +23,18 @@ This file defines:
 
 terraform {
   backend "gcs" {
-    bucket = "k8s-infra-tf-prow-clusters"
-    prefix = "k8s-infra-prow-build/prow-build" // $project_name/$cluster_name
+    bucket = "k8s-infra-terraform"
+    prefix = "k8s-infra-prow-build"
   }
 
   required_providers {
     google = {
       source  = "hashicorp/google"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
     google-beta = {
       source  = "hashicorp/google-beta"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
   }
 }
diff --git a/infra/gcp/terraform/modules/gke-cluster/versions.tf b/infra/gcp/terraform/modules/gke-cluster/versions.tf
index ca25a20458f..57347e35662 100644
--- a/infra/gcp/terraform/modules/gke-cluster/versions.tf
+++ b/infra/gcp/terraform/modules/gke-cluster/versions.tf
@@ -20,11 +20,11 @@ terraform {
   required_providers {
     google = {
       source  = "hashicorp/google"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
     google-beta = {
       source  = "hashicorp/google-beta"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
   }
 }
diff --git a/infra/gcp/terraform/modules/gke-nodepool/versions.tf b/infra/gcp/terraform/modules/gke-nodepool/versions.tf
index ca25a20458f..57347e35662 100644
--- a/infra/gcp/terraform/modules/gke-nodepool/versions.tf
+++ b/infra/gcp/terraform/modules/gke-nodepool/versions.tf
@@ -20,11 +20,11 @@ terraform {
   required_providers {
     google = {
       source  = "hashicorp/google"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
     google-beta = {
       source  = "hashicorp/google-beta"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
   }
 }
diff --git a/infra/gcp/terraform/modules/gke-project/versions.tf b/infra/gcp/terraform/modules/gke-project/versions.tf
index ca25a20458f..57347e35662 100644
--- a/infra/gcp/terraform/modules/gke-project/versions.tf
+++ b/infra/gcp/terraform/modules/gke-project/versions.tf
@@ -20,11 +20,11 @@ terraform {
   required_providers {
     google = {
       source  = "hashicorp/google"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
     google-beta = {
       source  = "hashicorp/google-beta"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
   }
 }
diff --git a/infra/gcp/terraform/modules/workload-identity-service-account/versions.tf b/infra/gcp/terraform/modules/workload-identity-service-account/versions.tf
index 7cff530704e..f7f05da0caa 100644
--- a/infra/gcp/terraform/modules/workload-identity-service-account/versions.tf
+++ b/infra/gcp/terraform/modules/workload-identity-service-account/versions.tf
@@ -17,11 +17,11 @@ terraform {
   required_providers {
     google = {
       source  = "hashicorp/google"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
     google-beta = {
       source  = "hashicorp/google-beta"
-      version = "~> 6.31.0"
+      version = "~> 6.50.0"
     }
   }
 }