1 change: 1 addition & 0 deletions iac/provider-gcp/Makefile
@@ -42,6 +42,7 @@ tf_vars := TF_VAR_environment=$(TERRAFORM_ENVIRONMENT) \
$(call tfvar, ALLOW_SANDBOX_INTERNET) \
$(call tfvar, API_RESOURCES_CPU_COUNT) \
$(call tfvar, API_RESOURCES_MEMORY_MB) \
$(call tfvar, INGRESS_COUNT) \
$(call tfvar, CLIENT_PROXY_COUNT) \
$(call tfvar, CLIENT_PROXY_RESOURCES_CPU_COUNT) \
$(call tfvar, CLIENT_PROXY_RESOURCES_MEMORY_MB) \
22 changes: 21 additions & 1 deletion iac/provider-gcp/init/main.tf
@@ -208,6 +208,27 @@ resource "google_secret_manager_secret_version" "analytics_collector_api_token"
depends_on = [time_sleep.secrets_api_wait_60_seconds]
}

resource "google_secret_manager_secret" "routing_domains" {
secret_id = "${var.prefix}routing-domains"

replication {
auto {}
}

depends_on = [time_sleep.secrets_api_wait_60_seconds]
}

resource "google_secret_manager_secret_version" "routing_domains" {
secret = google_secret_manager_secret.routing_domains.name
secret_data = jsonencode([])

lifecycle {
ignore_changes = [secret_data]
}

depends_on = [time_sleep.secrets_api_wait_60_seconds]
}

resource "google_artifact_registry_repository" "orchestration_repository" {
format = "DOCKER"
repository_id = "e2b-orchestration"
@@ -222,7 +243,6 @@ resource "time_sleep" "artifact_registry_api_wait_90_seconds" {
create_duration = "90s"
}


resource "google_artifact_registry_repository_iam_member" "orchestration_repository_member" {
repository = google_artifact_registry_repository.orchestration_repository.name
role = "roles/artifactregistry.reader"
4 changes: 4 additions & 0 deletions iac/provider-gcp/init/outputs.tf
@@ -30,6 +30,10 @@ output "analytics_collector_api_token_secret_name" {
value = google_secret_manager_secret.analytics_collector_api_token.name
}

output "routing_domains_secret_name" {
value = google_secret_manager_secret.routing_domains.name
}

output "orchestration_repository_name" {
value = google_artifact_registry_repository.orchestration_repository.name
}
21 changes: 19 additions & 2 deletions iac/provider-gcp/main.tf
@@ -54,6 +54,19 @@ provider "google" {
zone = var.gcp_zone
}

data "google_secret_manager_secret_version" "routing_domains" {
secret = module.init.routing_domains_secret_name
}

locals {
// Reading additional domains from the local env var is kept only for backward compatibility
additional_domains_from_secret = nonsensitive(jsondecode(data.google_secret_manager_secret_version.routing_domains.secret_data))
additional_domains_from_env = (var.additional_domains != "" ?
[for item in split(",", var.additional_domains) : trimspace(item)] : [])

additional_domains = distinct(concat(local.additional_domains_from_env, local.additional_domains_from_secret))
}
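
For illustration, the merge above can be reproduced standalone; the domain values below are hypothetical and only show how the env-var list and the secret's JSON array are combined and de-duplicated.

# Standalone sketch of the merge logic (hypothetical domain values).
locals {
  example_domains_from_secret = jsondecode("[\"app.example.com\", \"legacy.example.com\"]")
  example_domains_from_env    = [for item in split(",", "legacy.example.com, extra.example.com") : trimspace(item)]

  # distinct() drops the duplicate "legacy.example.com", yielding
  # ["legacy.example.com", "extra.example.com", "app.example.com"]
  example_domains = distinct(concat(local.example_domains_from_env, local.example_domains_from_secret))
}

output "example_domains" {
  value = local.example_domains
}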

module "init" {
source = "./init"

@@ -108,16 +121,16 @@ module "cluster" {
api_use_nat = var.api_use_nat
api_nat_ips = var.api_nat_ips

ingress_port = var.ingress_port
edge_api_port = var.edge_api_port
edge_proxy_port = var.edge_proxy_port
api_port = var.api_port
docker_reverse_proxy_port = var.docker_reverse_proxy_port
nomad_port = var.nomad_port
google_service_account_email = module.init.service_account_email
domain_name = var.domain_name
additional_domains = (var.additional_domains != "" ?
[for item in split(",", var.additional_domains) : trimspace(item)] : [])

additional_domains = local.additional_domains
additional_api_services = (var.additional_api_services_json != "" ?
jsondecode(var.additional_api_services_json) :
[])
@@ -172,6 +185,10 @@ module "nomad" {
clickhouse_job_constraint_prefix = var.clickhouse_job_constraint_prefix
clickhouse_node_pool = var.clickhouse_node_pool

# Ingress
ingress_port = var.ingress_port
ingress_count = var.ingress_count

# API
api_resources_cpu_count = var.api_resources_cpu_count
api_resources_memory_mb = var.api_resources_memory_mb
1 change: 1 addition & 0 deletions iac/provider-gcp/nomad-cluster/main.tf
@@ -96,6 +96,7 @@ module "network" {
api_use_nat = var.api_use_nat
api_nat_ips = var.api_nat_ips

ingress_port = var.ingress_port
api_port = var.api_port
docker_reverse_proxy_port = var.docker_reverse_proxy_port
network_name = var.network_name
58 changes: 58 additions & 0 deletions iac/provider-gcp/nomad-cluster/network/ingress.tf
@@ -0,0 +1,58 @@
resource "google_compute_health_check" "ingress" {
name = "${var.prefix}ingress"

timeout_sec = 3
check_interval_sec = 5
healthy_threshold = 2
unhealthy_threshold = 2

http_health_check {
port = var.ingress_port.port
request_path = var.ingress_port.health_path
}
}

resource "google_compute_backend_service" "ingress" {
name = "${var.prefix}ingress"

protocol = "HTTP"
port_name = var.ingress_port.name

session_affinity = null
health_checks = [google_compute_health_check.ingress.id]

timeout_sec = 65

load_balancing_scheme = "EXTERNAL_MANAGED"
locality_lb_policy = "ROUND_ROBIN"

backend {
group = var.api_instance_group
}
}

resource "google_compute_url_map" "ingress" {
Contributor
Can we get a better name here? I think we now have two load balancers, "ingress" and "orch_map", and neither name helps explain what it does. Maybe "traefik" and "direct"?

Member Author
I like "ingress", as it's a common name and it describes what this is. Yes, "orch_map" is, in my opinion, a mistake, as it no longer makes sense. Ideally, I would like to transition away from the current load balancer once the migration is complete, or rename it to something like "ingress-sandboxes" or a similar name to distinguish the two better.

I don't like calling it Traefik, as we can switch the ingress backend at any time in the future, but I'm okay with you coming up with a better name.

Contributor
A note here: if we want to rename orch_map to ingress-sandboxes, maybe we should name this ingress something like ingress-api, ingress-management, or ingress-services.

Member Author
I still don't like that we would need two load balancers just because we cannot filter sandbox traffic. Will look into it again tomorrow.

Yep, we can rename ingress to something else. I'm not sure about management/api, as we may use it for something different in the future. Ingress-services sounds okay to me.

Contributor
I thought it was actually quite nice to have separate LBs for users' sandbox traffic and our services traffic (different limitations, limits, HTTP support, etc.), but maybe it's unnecessary.

Member Author
Ideally, we should be able to match sandbox traffic to different rules (right now it's a catch-all fallback) so we can apply different limits/Armor rules to it; then we don't need to have different LBs.

For supporting newer versions of HTTP, etc., we can still migrate everything relatively easily, and I'm not sure we would need some special LB that cannot handle both sandbox and services traffic.

Member Author
Okay, I discovered that a GCP Armor policy rule allows filtering based on a host regex, so we can use one shared backend and apply dynamic rules based on the domain there. This solves our issue with needing two load balancers. I would stick with the ingress naming; once the migration is completed, we can remove orch-map as outdated.

Here is an example of a regexp that can catch sandbox traffic and apply rate limiting. In the same way, we can create rules for API limiting and other related restrictions. The good thing is that this only appends rules to the already existing security policy, so we can push rules even from a private monorepo that will handle blocking/rate limiting for services that are not open source.

resource "google_compute_security_policy_rule" "sandbox-throttling-ip" {
  security_policy = google_compute_security_policy.default["session"].name
  action          = "throttle"
  priority        = "500"

  match {
    expr {
      expression = <<-EOT
request.headers["host"].matches("^(?i)[0-9]+-[a-z0-9-]+\\.e2b-jirka\\.dev$")
EOT
    }
  }

  rate_limit_options {
    conform_action = "allow"
    exceed_action  = "deny(429)"

    enforce_on_key = ""

    enforce_on_key_configs {
      enforce_on_key_type = "IP"
    }

    rate_limit_threshold {
      count        = 40000
      interval_sec = 60
    }
  }

  description = "Requests to sandboxes from IP address"
}
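
In the same spirit, a companion rule for API traffic could look like the sketch below; the hostname, priority, and rate-limit threshold are illustrative placeholders rather than values from this PR.

resource "google_compute_security_policy_rule" "api-throttling-ip" {
  security_policy = google_compute_security_policy.default["session"].name
  action          = "throttle"
  priority        = "510"

  match {
    expr {
      # Hypothetical host match for the API domain
      expression = <<-EOT
request.headers["host"].matches("^(?i)api\\.e2b-jirka\\.dev$")
EOT
    }
  }

  rate_limit_options {
    conform_action = "allow"
    exceed_action  = "deny(429)"

    enforce_on_key = ""

    enforce_on_key_configs {
      enforce_on_key_type = "IP"
    }

    rate_limit_threshold {
      count        = 6000
      interval_sec = 60
    }
  }

  description = "Hypothetical per-IP rate limit for API requests"
}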

Member Author
Internal docs that will contain all the info related to the migration and its current state: https://www.notion.so/e2bdev/Ingress-Migration-288b8c296873807a8264f1615602d11d

name = "${var.prefix}ingress"
default_service = google_compute_backend_service.ingress.self_link
}

resource "google_compute_global_forwarding_rule" "ingress" {
name = "${var.prefix}ingress-forward-http"
ip_protocol = "TCP"
port_range = "443"
load_balancing_scheme = "EXTERNAL_MANAGED"
ip_address = google_compute_global_address.ingress_ipv4.address
target = google_compute_target_https_proxy.ingress.self_link
}

resource "google_compute_global_address" "ingress_ipv4" {
name = "${var.prefix}ingress-ipv4"
ip_version = "IPV4"
}

resource "google_compute_target_https_proxy" "ingress" {
name = "${var.prefix}ingress-https"
url_map = google_compute_url_map.ingress.self_link

certificate_map = "//certificatemanager.googleapis.com/${google_certificate_manager_certificate_map.certificate_map.id}"
}
5 changes: 5 additions & 0 deletions iac/provider-gcp/nomad-cluster/network/main.tf
@@ -455,6 +455,11 @@ resource "google_compute_firewall" "default-hc" {
}
}

allow {
protocol = "tcp"
ports = [var.ingress_port.port]
}

dynamic "allow" {
for_each = toset(var.additional_ports)

8 changes: 8 additions & 0 deletions iac/provider-gcp/nomad-cluster/network/variables.tf
@@ -51,6 +51,14 @@ variable "api_port" {
})
}

variable "ingress_port" {
type = object({
name = string
port = number
health_path = string
})
}

variable "docker_reverse_proxy_port" {
type = object({
name = string
5 changes: 5 additions & 0 deletions iac/provider-gcp/nomad-cluster/nodepool-api.tf
@@ -71,6 +71,11 @@ resource "google_compute_instance_group_manager" "api_pool" {
port = var.docker_reverse_proxy_port.port
}

named_port {
name = var.ingress_port.name
port = var.ingress_port.port
}

dynamic "named_port" {
for_each = local.api_additional_ports
content {
8 changes: 8 additions & 0 deletions iac/provider-gcp/nomad-cluster/variables.tf
@@ -100,6 +100,14 @@ variable "api_port" {
})
}

variable "ingress_port" {
type = object({
name = string
port = number
health_path = string
})
}

variable "docker_reverse_proxy_port" {
type = object({
name = string
95 changes: 95 additions & 0 deletions iac/provider-gcp/nomad/jobs/ingress.hcl
@@ -0,0 +1,95 @@
job "ingress" {
datacenters = ["${gcp_zone}"]
node_pool = "${node_pool}"
priority = 90

group "ingress" {
count = ${count}

constraint {
operator = "distinct_hosts"
value = "true"
}

network {
port "ingress" {
static = "${ingress_port}"
}

port "control" {
static = "${control_port}"
}
}

# https://developer.hashicorp.com/nomad/docs/job-specification/update
%{ if update_stanza }
update {
max_parallel = 1 # Update only 1 node at a time
}
%{ endif }

service {
port = "ingress"
name = "ingress"
task = "ingress"

check {
type = "http"
name = "health"
path = "/ping"
interval = "3s"
timeout = "3s"
port = "${ingress_port}"
}
}

task "ingress" {
driver = "docker"

%{ if update_stanza }
kill_timeout = "24h"
%{ endif }

kill_signal = "SIGTERM"

config {
network_mode = "host"
image = "traefik:v3.5"
ports = ["control", "ingress"]
args = [
# Entry-points that are set internally by Traefik
"--entrypoints.web.address=:${ingress_port}",
"--entrypoints.traefik.address=:${control_port}",

# Traefik internals (logging, metrics, ...)
"--api.dashboard=true",
"--api.insecure=false",

"--accesslog=true",
"--ping=true",
"--ping.entryPoint=web",
"--metrics=true",
"--metrics.prometheus=true",
"--metrics.prometheus.entryPoint=traefik",

# Traefik Nomad provider
"--providers.nomad=true",
"--providers.nomad.endpoint.address=${nomad_endpoint}",
"--providers.nomad.endpoint.token=${nomad_token}",

# Traefik Consul provider
"--providers.consulcatalog=true",
"--providers.consulcatalog.exposedByDefault=false",
"--providers.consulcatalog.endpoint.address=${consul_endpoint}",
"--providers.consulcatalog.endpoint.token=${consul_token}",
]
}

resources {
memory_max = ${memory_mb * 1.5}
memory = ${memory_mb}
cpu = ${cpu_count * 1000}
}
}
}
}
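
Because exposedByDefault is set to false for the Consul catalog provider, services presumably need explicit Traefik tags before this ingress will route to them. A minimal sketch of such a service stanza follows; the service name, router name, and host are assumptions, not values from this PR.

# Hypothetical Nomad service registration picked up by the consulcatalog provider above.
service {
  name = "edge-api" # assumed service name
  port = "http"

  tags = [
    "traefik.enable=true",
    # Route requests for this host through the "web" entrypoint defined in the job args
    "traefik.http.routers.edge-api.rule=Host(`api.example.com`)",
    "traefik.http.routers.edge-api.entrypoints=web",
  ]
}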
21 changes: 21 additions & 0 deletions iac/provider-gcp/nomad/main.tf
@@ -68,6 +68,27 @@ resource "docker_image" "db_migrator_image" {
platform = "linux/amd64/v8"
}

resource "nomad_job" "ingress" {
jobspec = templatefile("${path.module}/jobs/ingress.hcl",
{
count = var.ingress_count
update_stanza = var.api_machine_count > 1
cpu_count = 1
memory_mb = 512
node_pool = var.api_node_pool
gcp_zone = var.gcp_zone

ingress_port = var.ingress_port.port
control_port = 8900

nomad_endpoint = "http://localhost:4646"
nomad_token = var.nomad_acl_token_secret

consul_token = var.consul_acl_token_secret
consul_endpoint = "http://localhost:8500"
})
}

resource "nomad_job" "api" {
jobspec = templatefile("${path.module}/jobs/api.hcl", {
update_stanza = var.api_machine_count > 1
12 changes: 12 additions & 0 deletions iac/provider-gcp/nomad/variables.tf
@@ -64,6 +64,18 @@ variable "api_port" {
})
}

variable "ingress_port" {
type = object({
name = string
port = number
health_path = string
})
}

variable "ingress_count" {
type = number
}
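
For reference, a root-level .tfvars entry for these inputs might look roughly like the sketch below, assuming the root module declares matching variables; the port number is an assumption, while health_path matches the /ping check used by the Traefik job.

# Illustrative values only; the real defaults live outside this diff.
ingress_count = 2

ingress_port = {
  name        = "ingress"
  port        = 3010
  health_path = "/ping"
}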

variable "api_resources_cpu_count" {
type = number
}