From e32afe7702d6b67b9d01a6b799fb8c1fc266621e Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Mon, 16 Jun 2025 20:55:37 +0000 Subject: [PATCH 1/3] chore(pegboard): replace computing image size manually with using tar bytes read From fbd6691cca438fb370a0085437b7bddf3da02bd8 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Thu, 5 Jun 2025 05:57:52 +0000 Subject: [PATCH 2/3] feat: implement `clickhouse-user-query` --- Cargo.lock | 75 +++++++++++++++++++++++++++++++++++++++++++++++++----- Cargo.toml | 2 +- 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 799e73dc4c..607a19ca03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3273,6 +3273,12 @@ dependencies = [ "inout", ] +[[package]] +name = "cityhash-rs" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93a719913643003b84bd13022b4b7e703c09342cd03b679c4641c7d2e50dc34d" + [[package]] name = "cjson" version = "0.1.2" @@ -3345,13 +3351,13 @@ checksum = "a0875e527e299fc5f4faba42870bf199a39ab0bb2dbba1b8aef0a2151451130f" dependencies = [ "bstr", "bytes", - "clickhouse-derive", + "clickhouse-derive 0.1.1", "clickhouse-rs-cityhash-sys", "futures", "hyper 0.14.31", "hyper-tls 0.5.0", "lz4", - "sealed", + "sealed 0.4.0", "serde", "static_assertions", "thiserror 1.0.69", @@ -3360,6 +3366,31 @@ dependencies = [ "uuid", ] +[[package]] +name = "clickhouse" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3093f817c4f81c8bd174ed8dd30eac785821a8a7eef27a7dcb7f8cd0d0f6548" +dependencies = [ + "bstr", + "bytes", + "cityhash-rs", + "clickhouse-derive 0.2.0", + "futures", + "futures-channel", + "http-body-util", + "hyper 1.6.0", + "hyper-util", + "lz4_flex", + "replace_with", + "sealed 0.5.0", + "serde", + "static_assertions", + "thiserror 1.0.69", + "tokio", + "url", +] + [[package]] name = "clickhouse-derive" version = "0.1.1" @@ -3372,6 +3403,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "clickhouse-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d70f3e2893f7d3e017eeacdc9a708fbc29a10488e3ebca21f9df6a5d2b616dbb" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals 0.29.1", + "syn 2.0.90", +] + [[package]] name = "clickhouse-inserter" version = "25.5.2" @@ -3400,9 +3443,9 @@ dependencies = [ [[package]] name = "clickhouse-user-query" -version = "25.5.2" +version = "25.4.2" dependencies = [ - "clickhouse", + "clickhouse 0.12.2", "serde", "serde_json", "testcontainers 0.24.0", @@ -8356,7 +8399,7 @@ dependencies = [ "chirp-client", "chirp-worker", "chrono", - "clickhouse", + "clickhouse 0.11.6", "prost 0.10.4", "rivet-operation", "serde", @@ -8368,7 +8411,7 @@ version = "25.5.2" dependencies = [ "chirp-client", "chirp-worker", - "clickhouse", + "clickhouse 0.11.6", "reqwest 0.11.27", "rivet-config", "rivet-health-checks", @@ -12070,6 +12113,12 @@ dependencies = [ "sqlx", ] +[[package]] +name = "replace_with" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51743d3e274e2b18df81c4dc6caf8a5b8e15dbe799e0dca05c7617380094e884" + [[package]] name = "reqwest" version = "0.11.27" @@ -12840,7 +12889,7 @@ version = "25.5.2" dependencies = [ "anyhow", "async-nats", - "clickhouse", + "clickhouse 0.11.6", "clickhouse-inserter", "dirs", "divan", @@ -13757,6 +13806,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "sealed" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a8caec23b7800fb97971a1c6ae365b6239aaeddfb934d6265f8505e795699d" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "sec1" version = "0.3.0" diff --git a/Cargo.toml b/Cargo.toml index 42349bff27..920901a689 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [workspace] resolver = "2" -members = ["packages/common/api-helper/build","packages/common/api-helper/macros","packages/common/cache/build","packages/common/cache/result","packages/common/chirp-workflow/core","packages/common/chirp-workflow/macros","packages/common/chirp/client","packages/common/chirp/metrics","packages/common/chirp/perf","packages/common/chirp/types","packages/common/chirp/worker","packages/common/chirp/worker-attributes","packages/common/claims","packages/common/clickhouse-inserter","packages/common/clickhouse-user-query","packages/common/config","packages/common/connection","packages/common/convert","packages/common/deno-embed","packages/common/env","packages/common/fdb-util","packages/common/formatted-error","packages/common/global-error","packages/common/health-checks","packages/common/kv-str","packages/common/logs","packages/common/metrics","packages/common/migrate","packages/common/nomad-util","packages/common/operation/core","packages/common/operation/macros","packages/common/pools","packages/common/redis-util","packages/common/runtime","packages/common/s3-util","packages/common/schemac","packages/common/server-cli","packages/common/service-discovery","packages/common/service-manager","packages/common/smithy-output/api-auth/rust","packages/common/smithy-output/api-auth/rust-server","packages/common/smithy-output/api-cf-verification/rust","packages/common/smithy-output/api-cf-verification/rust-server","packages/common/smithy-output/api-cloud/rust","packages/common/smithy-output/api-cloud/rust-server","packages/common/smithy-output/api-group/rust","packages/common/smithy-output/api-group/rust-server","packages/common/smithy-output/api-identity/rust","packages/common/smithy-output/api-identity/rust-server","packages/common/smithy-output/api-job/rust","packages/common/smithy-output/api-job/rust-server","packages/common/smithy-output/api-kv/rust","packages/common/smithy-output/api-kv/rust-server","packages/common/smithy-output/api-matchmaker/rust","packages/common/smithy-output/api-matchmaker/rust-server","packages/common/smithy-output/api-party/rust","packages/common/smithy-output/api-party/rust-server","packages/common/smithy-output/api-portal/rust","packages/common/smithy-output/api-portal/rust-server","packages/common/smithy-output/api-status/rust","packages/common/smithy-output/api-status/rust-server","packages/common/smithy-output/api-traefik-provider/rust","packages/common/smithy-output/api-traefik-provider/rust-server","packages/common/test","packages/common/test-images","packages/common/types-proto/build","packages/common/types-proto/core","packages/common/util/core","packages/common/util/macros","packages/common/util/search","packages/core/api/actor","packages/core/api/auth","packages/core/api/cf-verification","packages/core/api/cloud","packages/core/api/games","packages/core/api/group","packages/core/api/identity","packages/core/api/intercom","packages/core/api/job","packages/core/api/matchmaker","packages/core/api/monolith-edge","packages/core/api/monolith-public","packages/core/api/portal","packages/core/api/provision","packages/core/api/status","packages/core/api/traefik-provider
","packages/core/api/ui","packages/core/infra/legacy/job-runner","packages/core/infra/schema-generator","packages/core/infra/server","packages/core/services/build","packages/core/services/build/ops/create","packages/core/services/build/ops/get","packages/core/services/build/ops/list-for-env","packages/core/services/build/ops/list-for-game","packages/core/services/build/standalone/default-create","packages/core/services/build/util","packages/core/services/captcha/ops/hcaptcha-config-get","packages/core/services/captcha/ops/hcaptcha-verify","packages/core/services/captcha/ops/request","packages/core/services/captcha/ops/turnstile-config-get","packages/core/services/captcha/ops/turnstile-verify","packages/core/services/captcha/ops/verify","packages/core/services/captcha/util","packages/core/services/cdn/ops/namespace-auth-user-remove","packages/core/services/cdn/ops/namespace-auth-user-update","packages/core/services/cdn/ops/namespace-create","packages/core/services/cdn/ops/namespace-domain-create","packages/core/services/cdn/ops/namespace-domain-remove","packages/core/services/cdn/ops/namespace-get","packages/core/services/cdn/ops/namespace-resolve-domain","packages/core/services/cdn/ops/ns-auth-type-set","packages/core/services/cdn/ops/ns-enable-domain-public-auth-set","packages/core/services/cdn/ops/site-create","packages/core/services/cdn/ops/site-get","packages/core/services/cdn/ops/site-list-for-game","packages/core/services/cdn/ops/version-get","packages/core/services/cdn/ops/version-prepare","packages/core/services/cdn/ops/version-publish","packages/core/services/cdn/util","packages/core/services/cdn/worker","packages/core/services/cf-custom-hostname/ops/get","packages/core/services/cf-custom-hostname/ops/list-for-namespace-id","packages/core/services/cf-custom-hostname/ops/resolve-hostname","packages/core/services/cf-custom-hostname/worker","packages/core/services/cloud/ops/device-link-create","packages/core/services/cloud/ops/game-config-create","packages/core/services/cloud/ops/game-config-get","packages/core/services/cloud/ops/game-token-create","packages/core/services/cloud/ops/namespace-create","packages/core/services/cloud/ops/namespace-get","packages/core/services/cloud/ops/namespace-token-development-create","packages/core/services/cloud/ops/namespace-token-public-create","packages/core/services/cloud/ops/version-get","packages/core/services/cloud/ops/version-publish","packages/core/services/cloud/standalone/default-create","packages/core/services/cloud/worker","packages/core/services/cluster","packages/core/services/cluster/standalone/datacenter-tls-renew","packages/core/services/cluster/standalone/default-update","packages/core/services/cluster/standalone/gc","packages/core/services/cluster/standalone/metrics-publish","packages/core/services/custom-user-avatar/ops/list-for-game","packages/core/services/custom-user-avatar/ops/upload-complete","packages/core/services/debug/ops/email-res","packages/core/services/dynamic-config","packages/core/services/email-verification/ops/complete","packages/core/services/email-verification/ops/create","packages/core/services/email/ops/send","packages/core/services/external/ops/request-validate","packages/core/services/external/worker","packages/core/services/faker/ops/build","packages/core/services/faker/ops/cdn-site","packages/core/services/faker/ops/game","packages/core/services/faker/ops/game-namespace","packages/core/services/faker/ops/game-version","packages/core/services/faker/ops/job-run","packages/core/services/faker/ops/job-template
","packages/core/services/faker/ops/mm-lobby","packages/core/services/faker/ops/mm-lobby-row","packages/core/services/faker/ops/mm-player","packages/core/services/faker/ops/region","packages/core/services/faker/ops/team","packages/core/services/faker/ops/user","packages/core/services/game/ops/banner-upload-complete","packages/core/services/game/ops/create","packages/core/services/game/ops/get","packages/core/services/game/ops/list-all","packages/core/services/game/ops/list-for-team","packages/core/services/game/ops/logo-upload-complete","packages/core/services/game/ops/namespace-create","packages/core/services/game/ops/namespace-get","packages/core/services/game/ops/namespace-list","packages/core/services/game/ops/namespace-resolve-name-id","packages/core/services/game/ops/namespace-resolve-url","packages/core/services/game/ops/namespace-validate","packages/core/services/game/ops/namespace-version-history-list","packages/core/services/game/ops/namespace-version-set","packages/core/services/game/ops/recommend","packages/core/services/game/ops/resolve-name-id","packages/core/services/game/ops/resolve-namespace-id","packages/core/services/game/ops/token-development-validate","packages/core/services/game/ops/validate","packages/core/services/game/ops/version-create","packages/core/services/game/ops/version-get","packages/core/services/game/ops/version-list","packages/core/services/game/ops/version-validate","packages/core/services/guard","packages/core/services/ip/ops/info","packages/core/services/job-log/ops/read","packages/core/services/job-log/worker","packages/core/services/job-run","packages/core/services/job/standalone/gc","packages/core/services/job/util","packages/core/services/linode","packages/core/services/linode/standalone/gc","packages/core/services/load-test/standalone/api-cloud","packages/core/services/load-test/standalone/mm","packages/core/services/load-test/standalone/mm-sustain","packages/core/services/load-test/standalone/sqlx","packages/core/services/load-test/standalone/watch-requests","packages/core/services/mm-config/ops/game-get","packages/core/services/mm-config/ops/game-upsert","packages/core/services/mm-config/ops/lobby-group-get","packages/core/services/mm-config/ops/lobby-group-resolve-name-id","packages/core/services/mm-config/ops/lobby-group-resolve-version","packages/core/services/mm-config/ops/namespace-config-set","packages/core/services/mm-config/ops/namespace-config-validate","packages/core/services/mm-config/ops/namespace-create","packages/core/services/mm-config/ops/namespace-get","packages/core/services/mm-config/ops/version-get","packages/core/services/mm-config/ops/version-prepare","packages/core/services/mm-config/ops/version-publish","packages/core/services/mm/ops/dev-player-token-create","packages/core/services/mm/ops/lobby-find-fail","packages/core/services/mm/ops/lobby-find-lobby-query-list","packages/core/services/mm/ops/lobby-find-try-complete","packages/core/services/mm/ops/lobby-for-run-id","packages/core/services/mm/ops/lobby-get","packages/core/services/mm/ops/lobby-history","packages/core/services/mm/ops/lobby-idle-update","packages/core/services/mm/ops/lobby-list-for-namespace","packages/core/services/mm/ops/lobby-list-for-user-id","packages/core/services/mm/ops/lobby-player-count","packages/core/services/mm/ops/lobby-runtime-aggregate","packages/core/services/mm/ops/lobby-state-get","packages/core/services/mm/ops/player-count-for-namespace","packages/core/services/mm/ops/player-get","packages/core/services/mm/standalone/gc","packages/core/
services/mm/util","packages/core/services/mm/worker","packages/core/services/monolith/standalone/worker","packages/core/services/monolith/standalone/workflow-worker","packages/core/services/nomad/standalone/monitor","packages/core/services/region/ops/get","packages/core/services/region/ops/list","packages/core/services/region/ops/list-for-game","packages/core/services/region/ops/recommend","packages/core/services/region/ops/resolve","packages/core/services/region/ops/resolve-for-game","packages/core/services/route","packages/core/services/server-spec","packages/core/services/team-invite/ops/get","packages/core/services/team-invite/worker","packages/core/services/team/ops/avatar-upload-complete","packages/core/services/team/ops/get","packages/core/services/team/ops/join-request-list","packages/core/services/team/ops/member-count","packages/core/services/team/ops/member-get","packages/core/services/team/ops/member-list","packages/core/services/team/ops/member-relationship-get","packages/core/services/team/ops/profile-validate","packages/core/services/team/ops/recommend","packages/core/services/team/ops/resolve-display-name","packages/core/services/team/ops/user-ban-get","packages/core/services/team/ops/user-ban-list","packages/core/services/team/ops/validate","packages/core/services/team/util","packages/core/services/team/worker","packages/core/services/telemetry/standalone/beacon","packages/core/services/tier","packages/core/services/token/ops/create","packages/core/services/token/ops/exchange","packages/core/services/token/ops/get","packages/core/services/token/ops/revoke","packages/core/services/upload/ops/complete","packages/core/services/upload/ops/file-list","packages/core/services/upload/ops/get","packages/core/services/upload/ops/list-for-user","packages/core/services/upload/ops/prepare","packages/core/services/upload/worker","packages/core/services/user","packages/core/services/user-identity/ops/create","packages/core/services/user-identity/ops/delete","packages/core/services/user-identity/ops/get","packages/core/services/user/ops/avatar-upload-complete","packages/core/services/user/ops/get","packages/core/services/user/ops/pending-delete-toggle","packages/core/services/user/ops/profile-validate","packages/core/services/user/ops/resolve-email","packages/core/services/user/ops/team-list","packages/core/services/user/ops/token-create","packages/core/services/user/standalone/delete-pending","packages/core/services/user/worker","packages/edge/api/actor","packages/edge/api/intercom","packages/edge/api/monolith-edge","packages/edge/api/monolith-public","packages/edge/api/traefik-provider","packages/edge/infra/client/actor-kv","packages/edge/infra/client/config","packages/edge/infra/client/container-runner","packages/edge/infra/client/echo","packages/edge/infra/client/isolate-v8-runner","packages/edge/infra/client/manager","packages/edge/infra/edge-server","packages/edge/infra/guard/core","packages/edge/infra/guard/server","packages/edge/services/monolith/standalone/workflow-worker","packages/edge/services/pegboard","packages/edge/services/pegboard/standalone/usage-metrics-publish","packages/edge/services/pegboard/standalone/ws","packages/toolchain/cli","packages/toolchain/js-utils-embed","packages/toolchain/toolchain","sdks/api/full/rust"] +members = 
["packages/common/api-helper/build","packages/common/api-helper/macros","packages/common/cache/build","packages/common/cache/result","packages/common/chirp-workflow/core","packages/common/chirp-workflow/macros","packages/common/chirp/client","packages/common/chirp/metrics","packages/common/chirp/perf","packages/common/chirp/types","packages/common/chirp/worker","packages/common/chirp/worker-attributes","packages/common/claims","packages/common/clickhouse-inserter","packages/common/clickhouse-user-query","packages/common/config","packages/common/connection","packages/common/convert","packages/common/deno-embed","packages/common/env","packages/common/fdb-util","packages/common/formatted-error","packages/common/global-error","packages/common/health-checks","packages/common/hub-embed","packages/common/kv-str","packages/common/logs","packages/common/metrics","packages/common/migrate","packages/common/nomad-util","packages/common/operation/core","packages/common/operation/macros","packages/common/pools","packages/common/redis-util","packages/common/runtime","packages/common/s3-util","packages/common/schemac","packages/common/server-cli","packages/common/service-discovery","packages/common/service-manager","packages/common/smithy-output/api-auth/rust","packages/common/smithy-output/api-auth/rust-server","packages/common/smithy-output/api-cf-verification/rust","packages/common/smithy-output/api-cf-verification/rust-server","packages/common/smithy-output/api-cloud/rust","packages/common/smithy-output/api-cloud/rust-server","packages/common/smithy-output/api-group/rust","packages/common/smithy-output/api-group/rust-server","packages/common/smithy-output/api-identity/rust","packages/common/smithy-output/api-identity/rust-server","packages/common/smithy-output/api-job/rust","packages/common/smithy-output/api-job/rust-server","packages/common/smithy-output/api-kv/rust","packages/common/smithy-output/api-kv/rust-server","packages/common/smithy-output/api-matchmaker/rust","packages/common/smithy-output/api-matchmaker/rust-server","packages/common/smithy-output/api-party/rust","packages/common/smithy-output/api-party/rust-server","packages/common/smithy-output/api-portal/rust","packages/common/smithy-output/api-portal/rust-server","packages/common/smithy-output/api-status/rust","packages/common/smithy-output/api-status/rust-server","packages/common/smithy-output/api-traefik-provider/rust","packages/common/smithy-output/api-traefik-provider/rust-server","packages/common/test","packages/common/test-images","packages/common/types-proto/build","packages/common/types-proto/core","packages/common/util/core","packages/common/util/macros","packages/common/util/search","packages/core/api/actor","packages/core/api/auth","packages/core/api/cf-verification","packages/core/api/cloud","packages/core/api/games","packages/core/api/group","packages/core/api/identity","packages/core/api/intercom","packages/core/api/job","packages/core/api/matchmaker","packages/core/api/monolith-edge","packages/core/api/monolith-public","packages/core/api/portal","packages/core/api/provision","packages/core/api/status","packages/core/api/traefik-provider","packages/core/api/ui","packages/core/infra/legacy/job-runner","packages/core/infra/schema-generator","packages/core/infra/server","packages/core/services/build","packages/core/services/build/ops/create","packages/core/services/build/ops/get","packages/core/services/build/ops/list-for-env","packages/core/services/build/ops/list-for-game","packages/core/services/build/standalone/default-crea
te","packages/core/services/build/util","packages/core/services/captcha/ops/hcaptcha-config-get","packages/core/services/captcha/ops/hcaptcha-verify","packages/core/services/captcha/ops/request","packages/core/services/captcha/ops/turnstile-config-get","packages/core/services/captcha/ops/turnstile-verify","packages/core/services/captcha/ops/verify","packages/core/services/captcha/util","packages/core/services/cdn/ops/namespace-auth-user-remove","packages/core/services/cdn/ops/namespace-auth-user-update","packages/core/services/cdn/ops/namespace-create","packages/core/services/cdn/ops/namespace-domain-create","packages/core/services/cdn/ops/namespace-domain-remove","packages/core/services/cdn/ops/namespace-get","packages/core/services/cdn/ops/namespace-resolve-domain","packages/core/services/cdn/ops/ns-auth-type-set","packages/core/services/cdn/ops/ns-enable-domain-public-auth-set","packages/core/services/cdn/ops/site-create","packages/core/services/cdn/ops/site-get","packages/core/services/cdn/ops/site-list-for-game","packages/core/services/cdn/ops/version-get","packages/core/services/cdn/ops/version-prepare","packages/core/services/cdn/ops/version-publish","packages/core/services/cdn/util","packages/core/services/cdn/worker","packages/core/services/cf-custom-hostname/ops/get","packages/core/services/cf-custom-hostname/ops/list-for-namespace-id","packages/core/services/cf-custom-hostname/ops/resolve-hostname","packages/core/services/cf-custom-hostname/worker","packages/core/services/cloud/ops/device-link-create","packages/core/services/cloud/ops/game-config-create","packages/core/services/cloud/ops/game-config-get","packages/core/services/cloud/ops/game-token-create","packages/core/services/cloud/ops/namespace-create","packages/core/services/cloud/ops/namespace-get","packages/core/services/cloud/ops/namespace-token-development-create","packages/core/services/cloud/ops/namespace-token-public-create","packages/core/services/cloud/ops/version-get","packages/core/services/cloud/ops/version-publish","packages/core/services/cloud/standalone/default-create","packages/core/services/cloud/worker","packages/core/services/cluster","packages/core/services/cluster/standalone/datacenter-tls-renew","packages/core/services/cluster/standalone/default-update","packages/core/services/cluster/standalone/gc","packages/core/services/cluster/standalone/metrics-publish","packages/core/services/custom-user-avatar/ops/list-for-game","packages/core/services/custom-user-avatar/ops/upload-complete","packages/core/services/debug/ops/email-res","packages/core/services/dynamic-config","packages/core/services/email-verification/ops/complete","packages/core/services/email-verification/ops/create","packages/core/services/email/ops/send","packages/core/services/external/ops/request-validate","packages/core/services/external/worker","packages/core/services/faker/ops/build","packages/core/services/faker/ops/cdn-site","packages/core/services/faker/ops/game","packages/core/services/faker/ops/game-namespace","packages/core/services/faker/ops/game-version","packages/core/services/faker/ops/job-run","packages/core/services/faker/ops/job-template","packages/core/services/faker/ops/mm-lobby","packages/core/services/faker/ops/mm-lobby-row","packages/core/services/faker/ops/mm-player","packages/core/services/faker/ops/region","packages/core/services/faker/ops/team","packages/core/services/faker/ops/user","packages/core/services/game/ops/banner-upload-complete","packages/core/services/game/ops/create","packages/core/services/game/ops/get
","packages/core/services/game/ops/list-all","packages/core/services/game/ops/list-for-team","packages/core/services/game/ops/logo-upload-complete","packages/core/services/game/ops/namespace-create","packages/core/services/game/ops/namespace-get","packages/core/services/game/ops/namespace-list","packages/core/services/game/ops/namespace-resolve-name-id","packages/core/services/game/ops/namespace-resolve-url","packages/core/services/game/ops/namespace-validate","packages/core/services/game/ops/namespace-version-history-list","packages/core/services/game/ops/namespace-version-set","packages/core/services/game/ops/recommend","packages/core/services/game/ops/resolve-name-id","packages/core/services/game/ops/resolve-namespace-id","packages/core/services/game/ops/token-development-validate","packages/core/services/game/ops/validate","packages/core/services/game/ops/version-create","packages/core/services/game/ops/version-get","packages/core/services/game/ops/version-list","packages/core/services/game/ops/version-validate","packages/core/services/ip/ops/info","packages/core/services/job-log/ops/read","packages/core/services/job-log/worker","packages/core/services/job-run","packages/core/services/job/standalone/gc","packages/core/services/job/util","packages/core/services/linode","packages/core/services/linode/standalone/gc","packages/core/services/load-test/standalone/api-cloud","packages/core/services/load-test/standalone/mm","packages/core/services/load-test/standalone/mm-sustain","packages/core/services/load-test/standalone/sqlx","packages/core/services/load-test/standalone/watch-requests","packages/core/services/mm-config/ops/game-get","packages/core/services/mm-config/ops/game-upsert","packages/core/services/mm-config/ops/lobby-group-get","packages/core/services/mm-config/ops/lobby-group-resolve-name-id","packages/core/services/mm-config/ops/lobby-group-resolve-version","packages/core/services/mm-config/ops/namespace-config-set","packages/core/services/mm-config/ops/namespace-config-validate","packages/core/services/mm-config/ops/namespace-create","packages/core/services/mm-config/ops/namespace-get","packages/core/services/mm-config/ops/version-get","packages/core/services/mm-config/ops/version-prepare","packages/core/services/mm-config/ops/version-publish","packages/core/services/mm/ops/dev-player-token-create","packages/core/services/mm/ops/lobby-find-fail","packages/core/services/mm/ops/lobby-find-lobby-query-list","packages/core/services/mm/ops/lobby-find-try-complete","packages/core/services/mm/ops/lobby-for-run-id","packages/core/services/mm/ops/lobby-get","packages/core/services/mm/ops/lobby-history","packages/core/services/mm/ops/lobby-idle-update","packages/core/services/mm/ops/lobby-list-for-namespace","packages/core/services/mm/ops/lobby-list-for-user-id","packages/core/services/mm/ops/lobby-player-count","packages/core/services/mm/ops/lobby-runtime-aggregate","packages/core/services/mm/ops/lobby-state-get","packages/core/services/mm/ops/player-count-for-namespace","packages/core/services/mm/ops/player-get","packages/core/services/mm/standalone/gc","packages/core/services/mm/util","packages/core/services/mm/worker","packages/core/services/monolith/standalone/worker","packages/core/services/monolith/standalone/workflow-worker","packages/core/services/nomad/standalone/monitor","packages/core/services/region/ops/get","packages/core/services/region/ops/list","packages/core/services/region/ops/list-for-game","packages/core/services/region/ops/recommend","packages/core/services/region/op
s/resolve","packages/core/services/region/ops/resolve-for-game","packages/core/services/route","packages/core/services/server-spec","packages/core/services/team-invite/ops/get","packages/core/services/team-invite/worker","packages/core/services/team/ops/avatar-upload-complete","packages/core/services/team/ops/get","packages/core/services/team/ops/join-request-list","packages/core/services/team/ops/member-count","packages/core/services/team/ops/member-get","packages/core/services/team/ops/member-list","packages/core/services/team/ops/member-relationship-get","packages/core/services/team/ops/profile-validate","packages/core/services/team/ops/recommend","packages/core/services/team/ops/resolve-display-name","packages/core/services/team/ops/user-ban-get","packages/core/services/team/ops/user-ban-list","packages/core/services/team/ops/validate","packages/core/services/team/util","packages/core/services/team/worker","packages/core/services/telemetry/standalone/beacon","packages/core/services/tier","packages/core/services/token/ops/create","packages/core/services/token/ops/exchange","packages/core/services/token/ops/get","packages/core/services/token/ops/revoke","packages/core/services/upload/ops/complete","packages/core/services/upload/ops/file-list","packages/core/services/upload/ops/get","packages/core/services/upload/ops/list-for-user","packages/core/services/upload/ops/prepare","packages/core/services/upload/worker","packages/core/services/user","packages/core/services/user-identity/ops/create","packages/core/services/user-identity/ops/delete","packages/core/services/user-identity/ops/get","packages/core/services/user/ops/avatar-upload-complete","packages/core/services/user/ops/get","packages/core/services/user/ops/pending-delete-toggle","packages/core/services/user/ops/profile-validate","packages/core/services/user/ops/resolve-email","packages/core/services/user/ops/team-list","packages/core/services/user/ops/token-create","packages/core/services/user/standalone/delete-pending","packages/core/services/user/worker","packages/edge/api/actor","packages/edge/api/intercom","packages/edge/api/monolith-edge","packages/edge/api/monolith-public","packages/edge/api/traefik-provider","packages/edge/infra/client/actor-kv","packages/edge/infra/client/config","packages/edge/infra/client/container-runner","packages/edge/infra/client/echo","packages/edge/infra/client/isolate-v8-runner","packages/edge/infra/client/manager","packages/edge/infra/edge-server","packages/edge/infra/guard/core","packages/edge/infra/guard/server","packages/edge/services/monolith/standalone/workflow-worker","packages/edge/services/pegboard","packages/edge/services/pegboard/standalone/usage-metrics-publish","packages/edge/services/pegboard/standalone/ws","packages/toolchain/cli","packages/toolchain/js-utils-embed","packages/toolchain/toolchain","sdks/api/full/rust"] [workspace.package] version = "25.5.3" From 7a40790e7ced8de508603c5cc8702b792515fa5b Mon Sep 17 00:00:00 2001 From: MasterPtato Date: Thu, 8 May 2025 01:51:25 +0000 Subject: [PATCH 3/3] feat: add runners to pb protocol --- Cargo.toml | 5 +- docker/dev-full/docker-compose.yml | 3 +- examples/system-test-actor/tests/client.ts | 7 + packages/common/fdb-util/src/keys.rs | 11 + packages/common/util/core/src/serde.rs | 12 +- packages/core/api/actor/src/route/builds.rs | 2 + .../20250508204859_alloc_type.down.sql | 0 .../20250508204859_alloc_type.up.sql | 3 + .../core/services/build/src/ops/create.rs | 12 +- packages/core/services/build/src/ops/get.rs | 8 + 
 .../build/src/ops/resolve_for_tags.rs | 4 +-
 packages/core/services/build/src/types.rs | 10 +
 .../files/pegboard_configure.sh | 10 +-
 packages/edge/api/actor/src/route/actors.rs | 286 +++--
 packages/edge/infra/client/config/Cargo.toml | 1 +
 .../client/config/src/isolate_runner/actor.rs | 28 -
 .../client/config/src/isolate_runner/mod.rs | 13 -
 packages/edge/infra/client/config/src/lib.rs | 2 -
 .../edge/infra/client/config/src/manager.rs | 9 -
 .../client/config/src/runner_protocol.rs | 34 +-
 .../edge/infra/client/config/src/utils.rs | 11 -
 .../client/container-runner/src/container.rs | 12 +-
 .../container-runner/src/log_shipper.rs | 12 +-
 .../infra/client/container-runner/src/main.rs | 16 +-
 packages/edge/infra/client/echo/Cargo.toml | 9 +-
 packages/edge/infra/client/echo/src/main.rs | 163 ++-
 .../infra/client/isolate-v8-runner/Cargo.toml | 47 -
 .../infra/client/isolate-v8-runner/Dockerfile | 29 -
 .../isolate-v8-runner/Dockerfile.dockerignore | 8 -
 .../infra/client/isolate-v8-runner/README.md | 11 -
 .../isolate-v8-runner/js/40_rivet_kv.js | 270 -----
 .../isolate-v8-runner/js/90_rivet_ns.js | 29 -
 .../js/lib/fast-equals/comparator.js | 220 ----
 .../js/lib/fast-equals/equals.js | 201 ----
 .../js/lib/fast-equals/index.js | 74 --
 .../js/lib/fast-equals/internalTypes.js | 5 -
 .../js/lib/fast-equals/utils.js | 56 -
 .../client/isolate-v8-runner/src/ext/kv.rs | 196 ---
 .../client/isolate-v8-runner/src/ext/mod.rs | 2 -
 .../isolate-v8-runner/src/ext/runtime.rs | 11 -
 .../client/isolate-v8-runner/src/isolate.rs | 648 ----------
 .../isolate-v8-runner/src/log_shipper.rs | 292 -----
 .../client/isolate-v8-runner/src/metadata.rs | 185 ---
 .../client/isolate-v8-runner/src/throttle.rs | 55 -
 .../client/isolate-v8-runner/src/utils.rs | 162 ---
 .../client/isolate-v8-runner/tests/index.js | 24 -
 packages/edge/infra/client/manager/Cargo.toml | 1 +
 .../infra/client/manager/src/actor/mod.rs | 476 +++-----
 packages/edge/infra/client/manager/src/ctx.rs | 538 ++++-----
 packages/edge/infra/client/manager/src/lib.rs | 2 -
 .../edge/infra/client/manager/src/main.rs | 4 +-
 .../infra/client/manager/src/metrics/mod.rs | 35 -
 .../edge/infra/client/manager/src/runner.rs | 391 ------
 .../infra/client/manager/src/runner/mod.rs | 666 ++++++++
 .../src/{actor => runner}/oci_config.rs | 18 +-
 .../{actor => runner}/partial_oci_config.rs | 0
 .../manager/src/{actor => runner}/seccomp.rs | 2 +-
 .../manager/src/{actor => runner}/setup.rs | 599 ++++-----
 .../infra/client/manager/src/utils/mod.rs | 72 +-
 .../edge/infra/client/manager/tests/common.rs | 160 +--
 .../edge/infra/client/manager/tests/index.js | 23 -
 .../client/manager/tests/isolate_lifecycle.rs | 216 ----
 .../infra/client/manager/tests/vector.json | 10 +-
 .../services/pegboard/src/keys/datacenter.rs | 119 ++
 .../edge/services/pegboard/src/keys/mod.rs | 1 +
 .../edge/services/pegboard/src/keys/runner.rs | 140 +++
 packages/edge/services/pegboard/src/lib.rs | 1 +
 .../edge/services/pegboard/src/protocol.rs | 63 +-
 .../pegboard/src/workflows/actor/runtime.rs | 5 +-
 .../pegboard/src/workflows/actor2/destroy.rs | 467 ++++++++
 .../src/workflows/actor2/migrations.rs | 78 ++
 .../pegboard/src/workflows/actor2/mod.rs | 513 ++++++++
 .../pegboard/src/workflows/actor2/runtime.rs | 1067 +++++++++++++++++
 .../pegboard/src/workflows/actor2/setup.rs | 728 +++++++++++
 .../pegboard/src/workflows/client/mod.rs | 3 +
 .../services/pegboard/src/workflows/mod.rs | 1 +
 76 files changed, 5123 insertions(+), 4484 deletions(-)
 create mode 100644 packages/core/services/build/db/build/migrations/20250508204859_alloc_type.down.sql
 create mode 100644 packages/core/services/build/db/build/migrations/20250508204859_alloc_type.up.sql
 delete mode 100644 packages/edge/infra/client/config/src/isolate_runner/actor.rs
 delete mode 100644 packages/edge/infra/client/config/src/isolate_runner/mod.rs
 delete mode 100644 packages/edge/infra/client/config/src/utils.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/Cargo.toml
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/Dockerfile
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/Dockerfile.dockerignore
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/README.md
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/js/40_rivet_kv.js
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/js/90_rivet_ns.js
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/comparator.js
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/equals.js
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/index.js
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/internalTypes.js
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/utils.js
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/src/ext/kv.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/src/ext/mod.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/src/ext/runtime.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/src/isolate.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/src/log_shipper.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/src/metadata.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/src/throttle.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/src/utils.rs
 delete mode 100644 packages/edge/infra/client/isolate-v8-runner/tests/index.js
 delete mode 100644 packages/edge/infra/client/manager/src/runner.rs
 create mode 100644 packages/edge/infra/client/manager/src/runner/mod.rs
 rename packages/edge/infra/client/manager/src/{actor => runner}/oci_config.rs (92%)
 rename packages/edge/infra/client/manager/src/{actor => runner}/partial_oci_config.rs (100%)
 rename packages/edge/infra/client/manager/src/{actor => runner}/seccomp.rs (99%)
 rename packages/edge/infra/client/manager/src/{actor => runner}/setup.rs (59%)
 delete mode 100644 packages/edge/infra/client/manager/tests/index.js
 delete mode 100644 packages/edge/infra/client/manager/tests/isolate_lifecycle.rs
 create mode 100644 packages/edge/services/pegboard/src/keys/runner.rs
 create mode 100644 packages/edge/services/pegboard/src/workflows/actor2/destroy.rs
 create mode 100644 packages/edge/services/pegboard/src/workflows/actor2/migrations.rs
 create mode 100644 packages/edge/services/pegboard/src/workflows/actor2/mod.rs
 create mode 100644 packages/edge/services/pegboard/src/workflows/actor2/runtime.rs
 create mode 100644 packages/edge/services/pegboard/src/workflows/actor2/setup.rs

diff --git a/Cargo.toml b/Cargo.toml
index 920901a689..9b68f615e6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [workspace]
 resolver = "2"
-members = 
["packages/common/api-helper/build","packages/common/api-helper/macros","packages/common/cache/build","packages/common/cache/result","packages/common/chirp-workflow/core","packages/common/chirp-workflow/macros","packages/common/chirp/client","packages/common/chirp/metrics","packages/common/chirp/perf","packages/common/chirp/types","packages/common/chirp/worker","packages/common/chirp/worker-attributes","packages/common/claims","packages/common/clickhouse-inserter","packages/common/clickhouse-user-query","packages/common/config","packages/common/connection","packages/common/convert","packages/common/deno-embed","packages/common/env","packages/common/fdb-util","packages/common/formatted-error","packages/common/global-error","packages/common/health-checks","packages/common/hub-embed","packages/common/kv-str","packages/common/logs","packages/common/metrics","packages/common/migrate","packages/common/nomad-util","packages/common/operation/core","packages/common/operation/macros","packages/common/pools","packages/common/redis-util","packages/common/runtime","packages/common/s3-util","packages/common/schemac","packages/common/server-cli","packages/common/service-discovery","packages/common/service-manager","packages/common/smithy-output/api-auth/rust","packages/common/smithy-output/api-auth/rust-server","packages/common/smithy-output/api-cf-verification/rust","packages/common/smithy-output/api-cf-verification/rust-server","packages/common/smithy-output/api-cloud/rust","packages/common/smithy-output/api-cloud/rust-server","packages/common/smithy-output/api-group/rust","packages/common/smithy-output/api-group/rust-server","packages/common/smithy-output/api-identity/rust","packages/common/smithy-output/api-identity/rust-server","packages/common/smithy-output/api-job/rust","packages/common/smithy-output/api-job/rust-server","packages/common/smithy-output/api-kv/rust","packages/common/smithy-output/api-kv/rust-server","packages/common/smithy-output/api-matchmaker/rust","packages/common/smithy-output/api-matchmaker/rust-server","packages/common/smithy-output/api-party/rust","packages/common/smithy-output/api-party/rust-server","packages/common/smithy-output/api-portal/rust","packages/common/smithy-output/api-portal/rust-server","packages/common/smithy-output/api-status/rust","packages/common/smithy-output/api-status/rust-server","packages/common/smithy-output/api-traefik-provider/rust","packages/common/smithy-output/api-traefik-provider/rust-server","packages/common/test","packages/common/test-images","packages/common/types-proto/build","packages/common/types-proto/core","packages/common/util/core","packages/common/util/macros","packages/common/util/search","packages/core/api/actor","packages/core/api/auth","packages/core/api/cf-verification","packages/core/api/cloud","packages/core/api/games","packages/core/api/group","packages/core/api/identity","packages/core/api/intercom","packages/core/api/job","packages/core/api/matchmaker","packages/core/api/monolith-edge","packages/core/api/monolith-public","packages/core/api/portal","packages/core/api/provision","packages/core/api/status","packages/core/api/traefik-provider","packages/core/api/ui","packages/core/infra/legacy/job-runner","packages/core/infra/schema-generator","packages/core/infra/server","packages/core/services/build","packages/core/services/build/ops/create","packages/core/services/build/ops/get","packages/core/services/build/ops/list-for-env","packages/core/services/build/ops/list-for-game","packages/core/services/build/standalone/default-crea
te","packages/core/services/build/util","packages/core/services/captcha/ops/hcaptcha-config-get","packages/core/services/captcha/ops/hcaptcha-verify","packages/core/services/captcha/ops/request","packages/core/services/captcha/ops/turnstile-config-get","packages/core/services/captcha/ops/turnstile-verify","packages/core/services/captcha/ops/verify","packages/core/services/captcha/util","packages/core/services/cdn/ops/namespace-auth-user-remove","packages/core/services/cdn/ops/namespace-auth-user-update","packages/core/services/cdn/ops/namespace-create","packages/core/services/cdn/ops/namespace-domain-create","packages/core/services/cdn/ops/namespace-domain-remove","packages/core/services/cdn/ops/namespace-get","packages/core/services/cdn/ops/namespace-resolve-domain","packages/core/services/cdn/ops/ns-auth-type-set","packages/core/services/cdn/ops/ns-enable-domain-public-auth-set","packages/core/services/cdn/ops/site-create","packages/core/services/cdn/ops/site-get","packages/core/services/cdn/ops/site-list-for-game","packages/core/services/cdn/ops/version-get","packages/core/services/cdn/ops/version-prepare","packages/core/services/cdn/ops/version-publish","packages/core/services/cdn/util","packages/core/services/cdn/worker","packages/core/services/cf-custom-hostname/ops/get","packages/core/services/cf-custom-hostname/ops/list-for-namespace-id","packages/core/services/cf-custom-hostname/ops/resolve-hostname","packages/core/services/cf-custom-hostname/worker","packages/core/services/cloud/ops/device-link-create","packages/core/services/cloud/ops/game-config-create","packages/core/services/cloud/ops/game-config-get","packages/core/services/cloud/ops/game-token-create","packages/core/services/cloud/ops/namespace-create","packages/core/services/cloud/ops/namespace-get","packages/core/services/cloud/ops/namespace-token-development-create","packages/core/services/cloud/ops/namespace-token-public-create","packages/core/services/cloud/ops/version-get","packages/core/services/cloud/ops/version-publish","packages/core/services/cloud/standalone/default-create","packages/core/services/cloud/worker","packages/core/services/cluster","packages/core/services/cluster/standalone/datacenter-tls-renew","packages/core/services/cluster/standalone/default-update","packages/core/services/cluster/standalone/gc","packages/core/services/cluster/standalone/metrics-publish","packages/core/services/custom-user-avatar/ops/list-for-game","packages/core/services/custom-user-avatar/ops/upload-complete","packages/core/services/debug/ops/email-res","packages/core/services/dynamic-config","packages/core/services/email-verification/ops/complete","packages/core/services/email-verification/ops/create","packages/core/services/email/ops/send","packages/core/services/external/ops/request-validate","packages/core/services/external/worker","packages/core/services/faker/ops/build","packages/core/services/faker/ops/cdn-site","packages/core/services/faker/ops/game","packages/core/services/faker/ops/game-namespace","packages/core/services/faker/ops/game-version","packages/core/services/faker/ops/job-run","packages/core/services/faker/ops/job-template","packages/core/services/faker/ops/mm-lobby","packages/core/services/faker/ops/mm-lobby-row","packages/core/services/faker/ops/mm-player","packages/core/services/faker/ops/region","packages/core/services/faker/ops/team","packages/core/services/faker/ops/user","packages/core/services/game/ops/banner-upload-complete","packages/core/services/game/ops/create","packages/core/services/game/ops/get
","packages/core/services/game/ops/list-all","packages/core/services/game/ops/list-for-team","packages/core/services/game/ops/logo-upload-complete","packages/core/services/game/ops/namespace-create","packages/core/services/game/ops/namespace-get","packages/core/services/game/ops/namespace-list","packages/core/services/game/ops/namespace-resolve-name-id","packages/core/services/game/ops/namespace-resolve-url","packages/core/services/game/ops/namespace-validate","packages/core/services/game/ops/namespace-version-history-list","packages/core/services/game/ops/namespace-version-set","packages/core/services/game/ops/recommend","packages/core/services/game/ops/resolve-name-id","packages/core/services/game/ops/resolve-namespace-id","packages/core/services/game/ops/token-development-validate","packages/core/services/game/ops/validate","packages/core/services/game/ops/version-create","packages/core/services/game/ops/version-get","packages/core/services/game/ops/version-list","packages/core/services/game/ops/version-validate","packages/core/services/ip/ops/info","packages/core/services/job-log/ops/read","packages/core/services/job-log/worker","packages/core/services/job-run","packages/core/services/job/standalone/gc","packages/core/services/job/util","packages/core/services/linode","packages/core/services/linode/standalone/gc","packages/core/services/load-test/standalone/api-cloud","packages/core/services/load-test/standalone/mm","packages/core/services/load-test/standalone/mm-sustain","packages/core/services/load-test/standalone/sqlx","packages/core/services/load-test/standalone/watch-requests","packages/core/services/mm-config/ops/game-get","packages/core/services/mm-config/ops/game-upsert","packages/core/services/mm-config/ops/lobby-group-get","packages/core/services/mm-config/ops/lobby-group-resolve-name-id","packages/core/services/mm-config/ops/lobby-group-resolve-version","packages/core/services/mm-config/ops/namespace-config-set","packages/core/services/mm-config/ops/namespace-config-validate","packages/core/services/mm-config/ops/namespace-create","packages/core/services/mm-config/ops/namespace-get","packages/core/services/mm-config/ops/version-get","packages/core/services/mm-config/ops/version-prepare","packages/core/services/mm-config/ops/version-publish","packages/core/services/mm/ops/dev-player-token-create","packages/core/services/mm/ops/lobby-find-fail","packages/core/services/mm/ops/lobby-find-lobby-query-list","packages/core/services/mm/ops/lobby-find-try-complete","packages/core/services/mm/ops/lobby-for-run-id","packages/core/services/mm/ops/lobby-get","packages/core/services/mm/ops/lobby-history","packages/core/services/mm/ops/lobby-idle-update","packages/core/services/mm/ops/lobby-list-for-namespace","packages/core/services/mm/ops/lobby-list-for-user-id","packages/core/services/mm/ops/lobby-player-count","packages/core/services/mm/ops/lobby-runtime-aggregate","packages/core/services/mm/ops/lobby-state-get","packages/core/services/mm/ops/player-count-for-namespace","packages/core/services/mm/ops/player-get","packages/core/services/mm/standalone/gc","packages/core/services/mm/util","packages/core/services/mm/worker","packages/core/services/monolith/standalone/worker","packages/core/services/monolith/standalone/workflow-worker","packages/core/services/nomad/standalone/monitor","packages/core/services/region/ops/get","packages/core/services/region/ops/list","packages/core/services/region/ops/list-for-game","packages/core/services/region/ops/recommend","packages/core/services/region/op
s/resolve","packages/core/services/region/ops/resolve-for-game","packages/core/services/route","packages/core/services/server-spec","packages/core/services/team-invite/ops/get","packages/core/services/team-invite/worker","packages/core/services/team/ops/avatar-upload-complete","packages/core/services/team/ops/get","packages/core/services/team/ops/join-request-list","packages/core/services/team/ops/member-count","packages/core/services/team/ops/member-get","packages/core/services/team/ops/member-list","packages/core/services/team/ops/member-relationship-get","packages/core/services/team/ops/profile-validate","packages/core/services/team/ops/recommend","packages/core/services/team/ops/resolve-display-name","packages/core/services/team/ops/user-ban-get","packages/core/services/team/ops/user-ban-list","packages/core/services/team/ops/validate","packages/core/services/team/util","packages/core/services/team/worker","packages/core/services/telemetry/standalone/beacon","packages/core/services/tier","packages/core/services/token/ops/create","packages/core/services/token/ops/exchange","packages/core/services/token/ops/get","packages/core/services/token/ops/revoke","packages/core/services/upload/ops/complete","packages/core/services/upload/ops/file-list","packages/core/services/upload/ops/get","packages/core/services/upload/ops/list-for-user","packages/core/services/upload/ops/prepare","packages/core/services/upload/worker","packages/core/services/user","packages/core/services/user-identity/ops/create","packages/core/services/user-identity/ops/delete","packages/core/services/user-identity/ops/get","packages/core/services/user/ops/avatar-upload-complete","packages/core/services/user/ops/get","packages/core/services/user/ops/pending-delete-toggle","packages/core/services/user/ops/profile-validate","packages/core/services/user/ops/resolve-email","packages/core/services/user/ops/team-list","packages/core/services/user/ops/token-create","packages/core/services/user/standalone/delete-pending","packages/core/services/user/worker","packages/edge/api/actor","packages/edge/api/intercom","packages/edge/api/monolith-edge","packages/edge/api/monolith-public","packages/edge/api/traefik-provider","packages/edge/infra/client/actor-kv","packages/edge/infra/client/config","packages/edge/infra/client/container-runner","packages/edge/infra/client/echo","packages/edge/infra/client/isolate-v8-runner","packages/edge/infra/client/manager","packages/edge/infra/edge-server","packages/edge/infra/guard/core","packages/edge/infra/guard/server","packages/edge/services/monolith/standalone/workflow-worker","packages/edge/services/pegboard","packages/edge/services/pegboard/standalone/usage-metrics-publish","packages/edge/services/pegboard/standalone/ws","packages/toolchain/cli","packages/toolchain/js-utils-embed","packages/toolchain/toolchain","sdks/api/full/rust"] +members = 
["packages/common/api-helper/build","packages/common/api-helper/macros","packages/common/cache/build","packages/common/cache/result","packages/common/chirp-workflow/core","packages/common/chirp-workflow/macros","packages/common/chirp/client","packages/common/chirp/metrics","packages/common/chirp/perf","packages/common/chirp/types","packages/common/chirp/worker","packages/common/chirp/worker-attributes","packages/common/claims","packages/common/config","packages/common/connection","packages/common/convert","packages/common/deno-embed","packages/common/env","packages/common/fdb-util","packages/common/formatted-error","packages/common/global-error","packages/common/health-checks","packages/common/hub-embed","packages/common/kv-str","packages/common/logs","packages/common/metrics","packages/common/migrate","packages/common/nomad-util","packages/common/operation/core","packages/common/operation/macros","packages/common/pools","packages/common/redis-util","packages/common/runtime","packages/common/s3-util","packages/common/schemac","packages/common/server-cli","packages/common/service-discovery","packages/common/service-manager","packages/common/smithy-output/api-auth/rust","packages/common/smithy-output/api-auth/rust-server","packages/common/smithy-output/api-cf-verification/rust","packages/common/smithy-output/api-cf-verification/rust-server","packages/common/smithy-output/api-cloud/rust","packages/common/smithy-output/api-cloud/rust-server","packages/common/smithy-output/api-group/rust","packages/common/smithy-output/api-group/rust-server","packages/common/smithy-output/api-identity/rust","packages/common/smithy-output/api-identity/rust-server","packages/common/smithy-output/api-job/rust","packages/common/smithy-output/api-job/rust-server","packages/common/smithy-output/api-kv/rust","packages/common/smithy-output/api-kv/rust-server","packages/common/smithy-output/api-matchmaker/rust","packages/common/smithy-output/api-matchmaker/rust-server","packages/common/smithy-output/api-party/rust","packages/common/smithy-output/api-party/rust-server","packages/common/smithy-output/api-portal/rust","packages/common/smithy-output/api-portal/rust-server","packages/common/smithy-output/api-status/rust","packages/common/smithy-output/api-status/rust-server","packages/common/smithy-output/api-traefik-provider/rust","packages/common/smithy-output/api-traefik-provider/rust-server","packages/common/test","packages/common/test-images","packages/common/types-proto/build","packages/common/types-proto/core","packages/common/util/core","packages/common/util/macros","packages/common/util/search","packages/core/api/actor","packages/core/api/auth","packages/core/api/cf-verification","packages/core/api/cloud","packages/core/api/games","packages/core/api/group","packages/core/api/identity","packages/core/api/intercom","packages/core/api/job","packages/core/api/matchmaker","packages/core/api/monolith-edge","packages/core/api/monolith-public","packages/core/api/portal","packages/core/api/provision","packages/core/api/status","packages/core/api/traefik-provider","packages/core/api/ui","packages/core/infra/legacy/job-runner","packages/core/infra/schema-generator","packages/core/infra/server","packages/core/services/build","packages/core/services/build/ops/create","packages/core/services/build/ops/get","packages/core/services/build/ops/list-for-env","packages/core/services/build/ops/list-for-game","packages/core/services/build/standalone/default-create","packages/core/services/build/util","packages/core/services/captcha/ops/hc
aptcha-config-get","packages/core/services/captcha/ops/hcaptcha-verify","packages/core/services/captcha/ops/request","packages/core/services/captcha/ops/turnstile-config-get","packages/core/services/captcha/ops/turnstile-verify","packages/core/services/captcha/ops/verify","packages/core/services/captcha/util","packages/core/services/cdn/ops/namespace-auth-user-remove","packages/core/services/cdn/ops/namespace-auth-user-update","packages/core/services/cdn/ops/namespace-create","packages/core/services/cdn/ops/namespace-domain-create","packages/core/services/cdn/ops/namespace-domain-remove","packages/core/services/cdn/ops/namespace-get","packages/core/services/cdn/ops/namespace-resolve-domain","packages/core/services/cdn/ops/ns-auth-type-set","packages/core/services/cdn/ops/ns-enable-domain-public-auth-set","packages/core/services/cdn/ops/site-create","packages/core/services/cdn/ops/site-get","packages/core/services/cdn/ops/site-list-for-game","packages/core/services/cdn/ops/version-get","packages/core/services/cdn/ops/version-prepare","packages/core/services/cdn/ops/version-publish","packages/core/services/cdn/util","packages/core/services/cdn/worker","packages/core/services/cf-custom-hostname/ops/get","packages/core/services/cf-custom-hostname/ops/list-for-namespace-id","packages/core/services/cf-custom-hostname/ops/resolve-hostname","packages/core/services/cf-custom-hostname/worker","packages/core/services/cloud/ops/device-link-create","packages/core/services/cloud/ops/game-config-create","packages/core/services/cloud/ops/game-config-get","packages/core/services/cloud/ops/game-token-create","packages/core/services/cloud/ops/namespace-create","packages/core/services/cloud/ops/namespace-get","packages/core/services/cloud/ops/namespace-token-development-create","packages/core/services/cloud/ops/namespace-token-public-create","packages/core/services/cloud/ops/version-get","packages/core/services/cloud/ops/version-publish","packages/core/services/cloud/standalone/default-create","packages/core/services/cloud/worker","packages/core/services/cluster","packages/core/services/cluster/standalone/datacenter-tls-renew","packages/core/services/cluster/standalone/default-update","packages/core/services/cluster/standalone/gc","packages/core/services/cluster/standalone/metrics-publish","packages/core/services/custom-user-avatar/ops/list-for-game","packages/core/services/custom-user-avatar/ops/upload-complete","packages/core/services/debug/ops/email-res","packages/core/services/dynamic-config","packages/core/services/email-verification/ops/complete","packages/core/services/email-verification/ops/create","packages/core/services/email/ops/send","packages/core/services/external/ops/request-validate","packages/core/services/external/worker","packages/core/services/faker/ops/build","packages/core/services/faker/ops/cdn-site","packages/core/services/faker/ops/game","packages/core/services/faker/ops/game-namespace","packages/core/services/faker/ops/game-version","packages/core/services/faker/ops/job-run","packages/core/services/faker/ops/job-template","packages/core/services/faker/ops/mm-lobby","packages/core/services/faker/ops/mm-lobby-row","packages/core/services/faker/ops/mm-player","packages/core/services/faker/ops/region","packages/core/services/faker/ops/team","packages/core/services/faker/ops/user","packages/core/services/game/ops/banner-upload-complete","packages/core/services/game/ops/create","packages/core/services/game/ops/get","packages/core/services/game/ops/list-all","packages/core/services/game/ops/
list-for-team","packages/core/services/game/ops/logo-upload-complete","packages/core/services/game/ops/namespace-create","packages/core/services/game/ops/namespace-get","packages/core/services/game/ops/namespace-list","packages/core/services/game/ops/namespace-resolve-name-id","packages/core/services/game/ops/namespace-resolve-url","packages/core/services/game/ops/namespace-validate","packages/core/services/game/ops/namespace-version-history-list","packages/core/services/game/ops/namespace-version-set","packages/core/services/game/ops/recommend","packages/core/services/game/ops/resolve-name-id","packages/core/services/game/ops/resolve-namespace-id","packages/core/services/game/ops/token-development-validate","packages/core/services/game/ops/validate","packages/core/services/game/ops/version-create","packages/core/services/game/ops/version-get","packages/core/services/game/ops/version-list","packages/core/services/game/ops/version-validate","packages/core/services/ip/ops/info","packages/core/services/job-log/ops/read","packages/core/services/job-log/worker","packages/core/services/job-run","packages/core/services/job/standalone/gc","packages/core/services/job/util","packages/core/services/linode","packages/core/services/linode/standalone/gc","packages/core/services/load-test/standalone/api-cloud","packages/core/services/load-test/standalone/mm","packages/core/services/load-test/standalone/mm-sustain","packages/core/services/load-test/standalone/sqlx","packages/core/services/load-test/standalone/watch-requests","packages/core/services/mm-config/ops/game-get","packages/core/services/mm-config/ops/game-upsert","packages/core/services/mm-config/ops/lobby-group-get","packages/core/services/mm-config/ops/lobby-group-resolve-name-id","packages/core/services/mm-config/ops/lobby-group-resolve-version","packages/core/services/mm-config/ops/namespace-config-set","packages/core/services/mm-config/ops/namespace-config-validate","packages/core/services/mm-config/ops/namespace-create","packages/core/services/mm-config/ops/namespace-get","packages/core/services/mm-config/ops/version-get","packages/core/services/mm-config/ops/version-prepare","packages/core/services/mm-config/ops/version-publish","packages/core/services/mm/ops/dev-player-token-create","packages/core/services/mm/ops/lobby-find-fail","packages/core/services/mm/ops/lobby-find-lobby-query-list","packages/core/services/mm/ops/lobby-find-try-complete","packages/core/services/mm/ops/lobby-for-run-id","packages/core/services/mm/ops/lobby-get","packages/core/services/mm/ops/lobby-history","packages/core/services/mm/ops/lobby-idle-update","packages/core/services/mm/ops/lobby-list-for-namespace","packages/core/services/mm/ops/lobby-list-for-user-id","packages/core/services/mm/ops/lobby-player-count","packages/core/services/mm/ops/lobby-runtime-aggregate","packages/core/services/mm/ops/lobby-state-get","packages/core/services/mm/ops/player-count-for-namespace","packages/core/services/mm/ops/player-get","packages/core/services/mm/standalone/gc","packages/core/services/mm/util","packages/core/services/mm/worker","packages/core/services/monolith/standalone/worker","packages/core/services/monolith/standalone/workflow-worker","packages/core/services/nomad/standalone/monitor","packages/core/services/region/ops/get","packages/core/services/region/ops/list","packages/core/services/region/ops/list-for-game","packages/core/services/region/ops/recommend","packages/core/services/region/ops/resolve","packages/core/services/region/ops/resolve-for-game","packages/core
/services/route","packages/core/services/server-spec","packages/core/services/team-invite/ops/get","packages/core/services/team-invite/worker","packages/core/services/team/ops/avatar-upload-complete","packages/core/services/team/ops/get","packages/core/services/team/ops/join-request-list","packages/core/services/team/ops/member-count","packages/core/services/team/ops/member-get","packages/core/services/team/ops/member-list","packages/core/services/team/ops/member-relationship-get","packages/core/services/team/ops/profile-validate","packages/core/services/team/ops/recommend","packages/core/services/team/ops/resolve-display-name","packages/core/services/team/ops/user-ban-get","packages/core/services/team/ops/user-ban-list","packages/core/services/team/ops/validate","packages/core/services/team/util","packages/core/services/team/worker","packages/core/services/telemetry/standalone/beacon","packages/core/services/tier","packages/core/services/token/ops/create","packages/core/services/token/ops/exchange","packages/core/services/token/ops/get","packages/core/services/token/ops/revoke","packages/core/services/upload/ops/complete","packages/core/services/upload/ops/file-list","packages/core/services/upload/ops/get","packages/core/services/upload/ops/list-for-user","packages/core/services/upload/ops/prepare","packages/core/services/upload/worker","packages/core/services/user","packages/core/services/user-identity/ops/create","packages/core/services/user-identity/ops/delete","packages/core/services/user-identity/ops/get","packages/core/services/user/ops/avatar-upload-complete","packages/core/services/user/ops/get","packages/core/services/user/ops/pending-delete-toggle","packages/core/services/user/ops/profile-validate","packages/core/services/user/ops/resolve-email","packages/core/services/user/ops/team-list","packages/core/services/user/ops/token-create","packages/core/services/user/standalone/delete-pending","packages/core/services/user/worker","packages/edge/api/actor","packages/edge/api/intercom","packages/edge/api/monolith-edge","packages/edge/api/monolith-public","packages/edge/api/traefik-provider","packages/edge/infra/client/actor-kv","packages/edge/infra/client/config","packages/edge/infra/client/container-runner","packages/edge/infra/client/echo","packages/edge/infra/client/manager","packages/edge/infra/edge-server","packages/edge/infra/guard/core","packages/edge/infra/guard/server","packages/edge/services/monolith/standalone/workflow-worker","packages/edge/services/pegboard","packages/edge/services/pegboard/standalone/ws","packages/toolchain/cli","packages/toolchain/js-utils-embed","packages/toolchain/toolchain","sdks/api/full/rust"] [workspace.package] version = "25.5.3" @@ -992,9 +992,6 @@ path = "packages/edge/infra/client/container-runner" [workspace.dependencies.pegboard-echo-server] path = "packages/edge/infra/client/echo" -[workspace.dependencies.pegboard-isolate-v8-runner] -path = "packages/edge/infra/client/isolate-v8-runner" - [workspace.dependencies.pegboard-manager] path = "packages/edge/infra/client/manager" diff --git a/docker/dev-full/docker-compose.yml b/docker/dev-full/docker-compose.yml index 04c6a77892..c9b261ee36 100644 --- a/docker/dev-full/docker-compose.yml +++ b/docker/dev-full/docker-compose.yml @@ -133,14 +133,13 @@ services: restart: unless-stopped command: /usr/bin/rivet-guard environment: - - RUST_LOG=debug + # - RUST_LOG=debug - RUST_BACKTRACE=1 - RUST_LOG_ANSI_COLOR=1 - RIVET_OTEL_ENABLED=1 - RIVET_OTEL_SAMPLER_RATIO=1 - RIVET_SERVICE_NAME=rivet-guard - 
RIVET_OTEL_ENDPOINT=http://otel-collector:4317 - - RUST_LOG=debug,hyper=info stop_grace_period: 0s ports: # HTTP diff --git a/examples/system-test-actor/tests/client.ts b/examples/system-test-actor/tests/client.ts index 7521330a31..d3d3f6269b 100644 --- a/examples/system-test-actor/tests/client.ts +++ b/examples/system-test-actor/tests/client.ts @@ -49,6 +49,13 @@ async function run() { guard: {}, }, }, + http2: { + protocol: "http", + internalPort: 8085, + routing: { + guard: {}, + }, + }, udp: { protocol: "udp", // internalPort: 80, diff --git a/packages/common/fdb-util/src/keys.rs b/packages/common/fdb-util/src/keys.rs index 4c62fab615..8db9112ec0 100644 --- a/packages/common/fdb-util/src/keys.rs +++ b/packages/common/fdb-util/src/keys.rs @@ -46,6 +46,11 @@ pub const SQLITE: usize = 44; pub const INTERNAL: usize = 45; pub const METADATA: usize = 46; pub const COMPRESSED_DATA: usize = 47; +pub const RUNNER: usize = 48; +pub const RUNNERS_BY_REMAINING_SLOTS: usize = 49; +pub const REMAINING_SLOTS: usize = 50; +pub const TOTAL_SLOTS: usize = 51; +pub const IMAGE_ID: usize = 52; // Directories with fdbrs must use string paths instead of tuples pub mod dir { @@ -103,6 +108,12 @@ pub fn key_from_str(key: &str) -> Option { "sqlite" => Some(SQLITE), "internal" => Some(INTERNAL), "metadata" => Some(METADATA), + "compressed_data" => Some(COMPRESSED_DATA), + "runner" => Some(RUNNER), + "runners_by_remaining_slots" => Some(RUNNERS_BY_REMAINING_SLOTS), + "remaining_slots" => Some(REMAINING_SLOTS), + "total_slots" => Some(TOTAL_SLOTS), + "image_id" => Some(IMAGE_ID), _ => None, } } diff --git a/packages/common/util/core/src/serde.rs b/packages/common/util/core/src/serde.rs index 1de9bfd9b2..581cadcc54 100644 --- a/packages/common/util/core/src/serde.rs +++ b/packages/common/util/core/src/serde.rs @@ -3,7 +3,7 @@ use std::{ fmt, hash::{Hash, Hasher}, marker::PhantomData, - ops::Deref, + ops::{Deref, DerefMut}, }; use indexmap::IndexMap; @@ -125,6 +125,10 @@ impl HashableMap { pub fn new() -> Self { HashableMap(IndexMap::new()) } + + pub fn with_capacity(capacity: usize) -> Self { + HashableMap(IndexMap::with_capacity(capacity)) + } } impl Default for HashableMap { @@ -141,6 +145,12 @@ impl Deref for HashableMap { } } +impl DerefMut for HashableMap { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl Hash for HashableMap { fn hash(&self, state: &mut H) { let mut kv = Vec::from_iter(&self.0); diff --git a/packages/core/api/actor/src/route/builds.rs b/packages/core/api/actor/src/route/builds.rs index e4c8e6935d..90c230558f 100644 --- a/packages/core/api/actor/src/route/builds.rs +++ b/packages/core/api/actor/src/route/builds.rs @@ -361,6 +361,8 @@ pub async fn create_build( .compression .map(ApiInto::api_into) .unwrap_or(build::types::BuildCompression::None), + allocation_type: build::types::BuildAllocationType::Single, + allocation_total_slots: 1, }) .await?; diff --git a/packages/core/services/build/db/build/migrations/20250508204859_alloc_type.down.sql b/packages/core/services/build/db/build/migrations/20250508204859_alloc_type.down.sql new file mode 100644 index 0000000000..e69de29bb2 diff --git a/packages/core/services/build/db/build/migrations/20250508204859_alloc_type.up.sql b/packages/core/services/build/db/build/migrations/20250508204859_alloc_type.up.sql new file mode 100644 index 0000000000..4f809054a0 --- /dev/null +++ b/packages/core/services/build/db/build/migrations/20250508204859_alloc_type.up.sql @@ -0,0 +1,3 @@ +ALTER TABLE builds + ADD allocation_type INT NOT 
NULL DEFAULT 0, + ADD allocation_total_slots INT NOT NULL DEFAULT 1; diff --git a/packages/core/services/build/src/ops/create.rs b/packages/core/services/build/src/ops/create.rs index 50d2a6cf35..b384dc01cd 100644 --- a/packages/core/services/build/src/ops/create.rs +++ b/packages/core/services/build/src/ops/create.rs @@ -4,7 +4,7 @@ use rivet_operation::prelude::proto::backend; const MAX_UPLOAD_SIZE: u64 = util::file_size::gigabytes(8); const MAX_JS_BUILD_UPLOAD_SIZE: u64 = util::file_size::megabytes(10); use crate::{ - types::{upload::PrepareFile, upload::PresignedUploadRequest, BuildCompression, BuildKind}, + types::{upload::PrepareFile, upload::PresignedUploadRequest, BuildCompression, BuildAllocationType, BuildKind}, utils, }; @@ -15,6 +15,8 @@ pub struct Input { pub content: Content, pub kind: BuildKind, pub compression: BuildCompression, + pub allocation_type: BuildAllocationType, + pub allocation_total_slots: u64, } #[derive(Debug)] @@ -158,10 +160,12 @@ pub async fn get(ctx: &OperationCtx, input: &Input) -> GlobalResult { image_tag, create_ts, kind, - compression + compression, + allocation_type, + allocation_total_slots ) VALUES - ($1, $2, $3, $4, $5, $6, $7, $8, $9) + ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11) ", build_id, game_id, @@ -172,6 +176,8 @@ pub async fn get(ctx: &OperationCtx, input: &Input) -> GlobalResult { ctx.ts(), input.kind as i32, input.compression as i32, + input.allocation_type as i32, + input.allocation_total_slots as i64, ) .await?; diff --git a/packages/core/services/build/src/ops/get.rs b/packages/core/services/build/src/ops/get.rs index 0c65322698..33098e7fe9 100644 --- a/packages/core/services/build/src/ops/get.rs +++ b/packages/core/services/build/src/ops/get.rs @@ -24,6 +24,8 @@ pub(crate) struct BuildRow { create_ts: i64, kind: i64, compression: i64, + allocation_type: i64, + allocation_total_slots: i64, tags: sqlx::types::Json>, } @@ -43,6 +45,10 @@ impl TryInto for BuildRow { compression: unwrap!(types::BuildCompression::from_repr( self.compression.try_into()? )), + allocation_type: unwrap!(types::BuildAllocationType::from_repr( + self.allocation_type.try_into()? + )), + allocation_total_slots: self.allocation_total_slots.try_into()?, // Filter out null values on tags tags: serde_json::from_str::>>(self.tags.0.get())? 
 				.into_iter()
@@ -74,6 +80,8 @@ pub async fn build_get(ctx: &OperationCtx, input: &Input) -> GlobalResult $2
 		",
diff --git a/packages/core/services/build/src/types.rs b/packages/core/services/build/src/types.rs
index 4ac2092826..392d408360 100644
--- a/packages/core/services/build/src/types.rs
+++ b/packages/core/services/build/src/types.rs
@@ -50,9 +50,19 @@ pub struct Build {
 	pub create_ts: i64,
 	pub kind: BuildKind,
 	pub compression: BuildCompression,
+	pub allocation_type: BuildAllocationType,
+	pub allocation_total_slots: u64,
 	pub tags: HashMap,
 }
 
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, Hash, PartialEq, Eq, FromRepr)]
+#[serde(rename_all = "snake_case")]
+pub enum BuildAllocationType {
+	None = 0,
+	Single = 1,
+	Multi = 2,
+}
+
 // TODO: Move to upload pkg when its converted to new ops
 pub mod upload {
 	use std::convert::TryInto;
diff --git a/packages/core/services/cluster/src/workflows/server/install/install_scripts/files/pegboard_configure.sh b/packages/core/services/cluster/src/workflows/server/install/install_scripts/files/pegboard_configure.sh
index 285f41d62c..baf710048c 100644
--- a/packages/core/services/cluster/src/workflows/server/install/install_scripts/files/pegboard_configure.sh
+++ b/packages/core/services/cluster/src/workflows/server/install/install_scripts/files/pegboard_configure.sh
@@ -2,8 +2,12 @@
 
 PUBLIC_IP=$(ip -4 route get 1.0.0.0 | awk '{print $7; exit}')
 
+SUBNET_IPV4="172.26.64.0/20"
+SUBNET_IPV4_GATEWAY_IP="172.26.64.1"
+SUBNET_IPV6="fd00:db8:2::/64"
+
 # MARK: Pegboard config
-cat << 'EOF' > /etc/rivet-client/config.json
+cat << EOF > /etc/rivet-client/config.json
 {
 	"client": {
 		"cluster": {
@@ -16,7 +20,7 @@
 			}
 		},
 		"runner": {
-			"flavor": "__FLAVOR__"
+			"ip": "$SUBNET_IPV4_GATEWAY_IP"
 		},
 		"images": {
 			"pull_addresses": {
@@ -57,8 +61,6 @@ EOF
 
 #
 # See Nomad equivalent: https://github.com/hashicorp/nomad/blob/a8f0f2612ef9d283ed903721f8453a0c0c3f51c5/client/allocrunner/networking_bridge_linux.go#L73
 ADMIN_CHAIN="RIVET-ADMIN"
-SUBNET_IPV4="172.26.64.0/20"
-SUBNET_IPV6="fd00:db8:2::/64"
 
 # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
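For context on the allocation columns introduced above: the builds table stores `allocation_type` as a bare INT (DEFAULT 0, i.e. `None`), `create.rs` binds it with `input.allocation_type as i32`, and `get.rs` decodes it with `from_repr`; builds created through the API default to `Single` with `allocation_total_slots: 1`, per the builds.rs hunk earlier in the patch. A minimal round-trip sketch, assuming the `FromRepr` in the derive list above is strum's derive (the source crate is not named in this diff):

// Sketch only; assumes strum with the "derive" feature provides `FromRepr`.
use strum::FromRepr;

#[derive(Clone, Copy, Debug, PartialEq, Eq, FromRepr)]
enum BuildAllocationType {
	None = 0,
	Single = 1,
	Multi = 2,
}

fn main() {
	// create.rs inserts the bare discriminant (`input.allocation_type as i32`),
	// so the column holds 0, 1, or 2.
	let stored = BuildAllocationType::Single as i32;

	// get.rs converts the integer column back with `from_repr`, which returns
	// `None` for out-of-range values; the surrounding `unwrap!` turns that
	// into an error rather than silently accepting corrupt data.
	assert_eq!(
		BuildAllocationType::from_repr(stored as usize),
		Some(BuildAllocationType::Single)
	);
	assert_eq!(BuildAllocationType::from_repr(7), None);
}

The edge API later in this patch branches on the same field, routing `None` builds to the existing `actor` workflow and allocation-aware builds to the new `actor2` workflow.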
# diff --git a/packages/edge/api/actor/src/route/actors.rs b/packages/edge/api/actor/src/route/actors.rs index 7cdb2f23f4..736541554f 100644 --- a/packages/edge/api/actor/src/route/actors.rs +++ b/packages/edge/api/actor/src/route/actors.rs @@ -150,106 +150,210 @@ pub async fn create( .map(ApiInto::api_into); tracing::info!(?actor_id, ?tags, "creating actor with tags"); + + if let build::types::BuildAllocationType::None = build.allocation_type { + let allocated_fut = if network.wait_ready.unwrap_or_default() { + std::future::pending().boxed() + } else { + let mut allocated_sub = ctx + .subscribe::(("actor_id", actor_id)) + .await?; - let allocated_fut = if network.wait_ready.unwrap_or_default() { - std::future::pending().boxed() - } else { - let mut allocated_sub = ctx - .subscribe::(("actor_id", actor_id)) + async move { allocated_sub.next().await }.boxed() + }; + let mut ready_sub = ctx + .subscribe::(("actor_id", actor_id)) + .await?; + let mut fail_sub = ctx + .subscribe::(("actor_id", actor_id)) + .await?; + let mut destroy_sub = ctx + .subscribe::(("actor_id", actor_id)) .await?; - async move { allocated_sub.next().await }.boxed() - }; - let mut ready_sub = ctx - .subscribe::(("actor_id", actor_id)) - .await?; - let mut fail_sub = ctx - .subscribe::(("actor_id", actor_id)) - .await?; - let mut destroy_sub = ctx - .subscribe::(("actor_id", actor_id)) + ctx.workflow(pegboard::workflows::actor::Input { + actor_id, + env_id, + tags, + resources, + lifecycle: body.lifecycle.map(|x| (*x).api_into()).unwrap_or_else(|| { + pegboard::types::ActorLifecycle { + kill_timeout_ms: 0, + durable: false, + } + }), + image_id: build.build_id, + root_user_enabled: game_config.root_user_enabled, + // args: body.runtime.arguments.unwrap_or_default(), + args: Vec::new(), + network_mode: network.mode.unwrap_or_default().api_into(), + environment: body.runtime.and_then(|r| r.environment).unwrap_or_default().as_hashable(), + network_ports: network + .ports + .unwrap_or_default() + .into_iter() + .map(|(s, p)| GlobalResult::Ok(( + s.clone(), + pegboard::workflows::actor::Port { + internal_port: p.internal_port.map(TryInto::try_into).transpose()?, + routing: if let Some(routing) = p.routing { + match *routing { + models::ActorsPortRouting { + guard: Some(_gg), + host: None, + } => pegboard::types::Routing::GameGuard { + protocol: p.protocol.api_into(), + }, + models::ActorsPortRouting { + guard: None, + host: Some(_), + } => pegboard::types::Routing::Host { + protocol: match p.protocol.api_try_into() { + Err(err) if GlobalError::is(&err, formatted_error::code::ACTOR_FAILED_TO_CREATE) => { + // Add location + bail_with!( + ACTOR_FAILED_TO_CREATE, + error = format!("network.ports[{s:?}].protocol: Host port protocol must be either TCP or UDP.") + ); + } + x => x?, + }, + }, + models::ActorsPortRouting { .. } => { + bail_with!( + ACTOR_FAILED_TO_CREATE, + error = format!("network.ports[{s:?}].routing: Must specify either `guard` or `host` routing type.") + ); + } + } + } else { + pegboard::types::Routing::GameGuard { + protocol: p.protocol.api_into(), + } + } + } + ))) + .collect::>>()?, + endpoint_type, + }) + .tag("actor_id", actor_id) + .dispatch() .await?; - - ctx.workflow(pegboard::workflows::actor::Input { - actor_id, - env_id, - tags, - resources, - lifecycle: body.lifecycle.map(|x| (*x).api_into()).unwrap_or_else(|| { - pegboard::types::ActorLifecycle { - kill_timeout_ms: 0, - durable: false, + + // Wait for allocated/ready, fail, or destroy + tokio::select! 
{ + res = allocated_fut => { res?; }, + res = ready_sub.next() => { res?; }, + res = fail_sub.next() => { + let msg = res?; + bail_with!(ACTOR_FAILED_TO_CREATE, error = msg.message); } - }), - image_id: build.build_id, - root_user_enabled: game_config.root_user_enabled, - // args: body.runtime.arguments.unwrap_or_default(), - args: Vec::new(), - network_mode: network.mode.unwrap_or_default().api_into(), - environment: body.runtime.and_then(|r| r.environment).unwrap_or_default().as_hashable(), - network_ports: network - .ports - .unwrap_or_default() - .into_iter() - .map(|(s, p)| GlobalResult::Ok(( - s.clone(), - pegboard::workflows::actor::Port { - internal_port: p.internal_port.map(TryInto::try_into).transpose()?, - routing: if let Some(routing) = p.routing { - match *routing { - models::ActorsPortRouting { - guard: Some(_gg), - host: None, - } => pegboard::types::Routing::GameGuard { - protocol: p.protocol.api_into(), - }, - models::ActorsPortRouting { - guard: None, - host: Some(_), - } => pegboard::types::Routing::Host { - protocol: match p.protocol.api_try_into() { - Err(err) if GlobalError::is(&err, formatted_error::code::ACTOR_FAILED_TO_CREATE) => { - // Add location - bail_with!( - ACTOR_FAILED_TO_CREATE, - error = format!("network.ports[{s:?}].protocol: Host port protocol must be either TCP or UDP.") - ); - } - x => x?, + res = destroy_sub.next() => { + res?; + bail_with!(ACTOR_FAILED_TO_CREATE, error = "Actor failed before reaching a ready state."); + } + } + } else { + let allocated_fut = if network.wait_ready.unwrap_or_default() { + std::future::pending().boxed() + } else { + let mut allocated_sub = ctx + .subscribe::(("actor_id", actor_id)) + .await?; + + async move { allocated_sub.next().await }.boxed() + }; + let mut ready_sub = ctx + .subscribe::(("actor_id", actor_id)) + .await?; + let mut fail_sub = ctx + .subscribe::(("actor_id", actor_id)) + .await?; + let mut destroy_sub = ctx + .subscribe::(("actor_id", actor_id)) + .await?; + + ctx.workflow(pegboard::workflows::actor2::Input { + actor_id, + env_id, + tags, + resources, + lifecycle: body.lifecycle.map(|x| (*x).api_into()).unwrap_or_else(|| { + pegboard::types::ActorLifecycle { + kill_timeout_ms: 0, + durable: false, + } + }), + image_id: build.build_id, + root_user_enabled: game_config.root_user_enabled, + // args: body.runtime.arguments.unwrap_or_default(), + args: Vec::new(), + network_mode: network.mode.unwrap_or_default().api_into(), + environment: body.runtime.and_then(|r| r.environment).unwrap_or_default().as_hashable(), + network_ports: network + .ports + .unwrap_or_default() + .into_iter() + .map(|(s, p)| GlobalResult::Ok(( + s.clone(), + pegboard::workflows::actor2::Port { + internal_port: p.internal_port.map(TryInto::try_into).transpose()?, + routing: if let Some(routing) = p.routing { + match *routing { + models::ActorsPortRouting { + guard: Some(_gg), + host: None, + } => pegboard::types::Routing::GameGuard { + protocol: p.protocol.api_into(), + }, + models::ActorsPortRouting { + guard: None, + host: Some(_), + } => pegboard::types::Routing::Host { + protocol: match p.protocol.api_try_into() { + Err(err) if GlobalError::is(&err, formatted_error::code::ACTOR_FAILED_TO_CREATE) => { + // Add location + bail_with!( + ACTOR_FAILED_TO_CREATE, + error = format!("network.ports[{s:?}].protocol: Host port protocol must be either TCP or UDP.") + ); + } + x => x?, + }, }, - }, - models::ActorsPortRouting { .. 
} => { - bail_with!( - ACTOR_FAILED_TO_CREATE, - error = format!("network.ports[{s:?}].routing: Must specify either `guard` or `host` routing type.") - ); + models::ActorsPortRouting { .. } => { + bail_with!( + ACTOR_FAILED_TO_CREATE, + error = format!("network.ports[{s:?}].routing: Must specify either `guard` or `host` routing type.") + ); + } + } + } else { + pegboard::types::Routing::GameGuard { + protocol: p.protocol.api_into(), } - } - } else { - pegboard::types::Routing::GameGuard { - protocol: p.protocol.api_into(), } } - } - ))) - .collect::>>()?.as_hashable(), - endpoint_type, - }) - .tag("actor_id", actor_id) - .dispatch() - .await?; - - // Wait for allocated/ready, fail, or destroy - tokio::select! { - res = allocated_fut => { res?; }, - res = ready_sub.next() => { res?; }, - res = fail_sub.next() => { - let msg = res?; - bail_with!(ACTOR_FAILED_TO_CREATE, error = msg.message); - } - res = destroy_sub.next() => { - res?; - bail_with!(ACTOR_FAILED_TO_CREATE, error = "Actor failed before reaching a ready state."); + ))) + .collect::>>()?, + endpoint_type, + }) + .tag("actor_id", actor_id) + .dispatch() + .await?; + + // Wait for create/ready, fail, or destroy + tokio::select! { + res = allocated_fut => { res?; }, + res = ready_sub.next() => { res?; }, + res = fail_sub.next() => { + let msg = res?; + bail_with!(ACTOR_FAILED_TO_CREATE, error = msg.message); + } + res = destroy_sub.next() => { + res?; + bail_with!(ACTOR_FAILED_TO_CREATE, error = "Actor failed before reaching a ready state."); + } } } diff --git a/packages/edge/infra/client/config/Cargo.toml b/packages/edge/infra/client/config/Cargo.toml index 86e8d497c0..1cf7ca7d12 100644 --- a/packages/edge/infra/client/config/Cargo.toml +++ b/packages/edge/infra/client/config/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true edition.workspace = true [dependencies] +ipnet = { version = "2.10.1", features = ["serde"] } schemars = { version = "0.8.21", features = ["url", "uuid1"] } serde = { version = "1.0.195", features = ["derive"] } url = "2.2.2" diff --git a/packages/edge/infra/client/config/src/isolate_runner/actor.rs b/packages/edge/infra/client/config/src/isolate_runner/actor.rs deleted file mode 100644 index e4a06daeec..0000000000 --- a/packages/edge/infra/client/config/src/isolate_runner/actor.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::collections::HashMap; - -use pegboard::protocol; -use serde::{Deserialize, Serialize}; - -/// Config for running an isolate. Similar to runc config. -#[derive(Serialize, Deserialize)] -pub struct Config { - pub resources: Resources, - pub ports: HashMap, - pub env: HashMap, - pub metadata: protocol::Raw, - pub vector_socket_addr: Option, -} - -#[derive(Serialize, Deserialize)] -pub struct Resources { - /// Bytes. - pub memory: u64, - /// Bytes. 
- pub memory_max: u64, -} - -#[derive(Serialize, Deserialize)] -pub struct Port { - pub target: u16, - pub protocol: protocol::TransportProtocol, -} diff --git a/packages/edge/infra/client/config/src/isolate_runner/mod.rs b/packages/edge/infra/client/config/src/isolate_runner/mod.rs deleted file mode 100644 index 64f8913086..0000000000 --- a/packages/edge/infra/client/config/src/isolate_runner/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ -use std::{net::SocketAddr, path::PathBuf}; - -use serde::{Deserialize, Serialize}; - -pub mod actor; - -#[derive(Clone, Serialize, Deserialize)] -pub struct Config { - pub actors_path: PathBuf, - pub manager_ws_addr: SocketAddr, - - pub foundationdb: crate::manager::FoundationDb, -} diff --git a/packages/edge/infra/client/config/src/lib.rs b/packages/edge/infra/client/config/src/lib.rs index 024438915b..ea1ae0bf06 100644 --- a/packages/edge/infra/client/config/src/lib.rs +++ b/packages/edge/infra/client/config/src/lib.rs @@ -1,5 +1,3 @@ -pub mod isolate_runner; mod manager; pub mod runner_protocol; -pub mod utils; pub use manager::*; diff --git a/packages/edge/infra/client/config/src/manager.rs b/packages/edge/infra/client/config/src/manager.rs index 787e7f8a06..040e2c30c9 100644 --- a/packages/edge/infra/client/config/src/manager.rs +++ b/packages/edge/infra/client/config/src/manager.rs @@ -5,7 +5,6 @@ use std::{ time::Duration, }; -use pegboard::protocol; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use url::Url; @@ -83,7 +82,6 @@ pub struct Cluster { #[derive(Clone, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case", deny_unknown_fields)] pub struct Runner { - pub flavor: protocol::ClientFlavor, /// Whether or not to use a mount for actor file systems. pub use_mounts: Option, @@ -100,7 +98,6 @@ pub struct Runner { pub port: Option, pub container_runner_binary_path: Option, - pub isolate_runner_binary_path: Option, /// Custom host entries to append to /etc/hosts in actor containers. 
#[serde(default)] @@ -126,12 +123,6 @@ impl Runner { .unwrap_or_else(|| Path::new("/usr/local/bin/rivet-container-runner").into()) } - pub fn isolate_runner_binary_path(&self) -> PathBuf { - self.isolate_runner_binary_path - .clone() - .unwrap_or_else(|| Path::new("/usr/local/bin/rivet-isolate-v8-runner").into()) - } - pub fn custom_hosts(&self) -> &[HostEntry] { self.custom_hosts.as_deref().unwrap_or(&[]) } diff --git a/packages/edge/infra/client/config/src/runner_protocol.rs b/packages/edge/infra/client/config/src/runner_protocol.rs index bb89439720..9fd1e64a50 100644 --- a/packages/edge/infra/client/config/src/runner_protocol.rs +++ b/packages/edge/infra/client/config/src/runner_protocol.rs @@ -1,18 +1,42 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; +use pegboard::protocol; -#[derive(Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", deny_unknown_fields)] +pub enum ToManager { + Init { + runner_id: Uuid, + }, + ActorStateUpdate { + actor_id: Uuid, + generation: u32, + state: ActorState, + } +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", deny_unknown_fields)] pub enum ToRunner { - Start { + StartActor { actor_id: Uuid, generation: u32, + env: protocol::HashableMap, + metadata: protocol::Raw, }, - Signal { + SignalActor { actor_id: Uuid, generation: u32, signal: i32, persist_storage: bool, }, - // Kills the runner process - Terminate, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", deny_unknown_fields)] +pub enum ActorState { + Running, + Exited { + exit_code: Option, + }, } diff --git a/packages/edge/infra/client/config/src/utils.rs b/packages/edge/infra/client/config/src/utils.rs deleted file mode 100644 index 8952e8057b..0000000000 --- a/packages/edge/infra/client/config/src/utils.rs +++ /dev/null @@ -1,11 +0,0 @@ -// IMPORTANT: This cannot be just `rivet-` because this is used as a prefix to filter cgroup names -// in cadvisor. -// -// If this was "rivet-", we'd have to report on non-actor cgropus with cadvisor. -// -// See also packages/core/services/cluster/src/workflows/server/install/install_scripts/files/cadvisor_metric_exporter.sh & packages/core/api/actor/src/route/metrics.rs -pub const RIVET_CONTAINER_PREFIX: &str = "pegboard-actor-"; - -pub fn format_container_id(actor_id: &str, generation: u32) -> String { - format!("{RIVET_CONTAINER_PREFIX}{actor_id}-{generation}") -} diff --git a/packages/edge/infra/client/container-runner/src/container.rs b/packages/edge/infra/client/container-runner/src/container.rs index 17da27e621..bb03f52876 100644 --- a/packages/edge/infra/client/container-runner/src/container.rs +++ b/packages/edge/infra/client/container-runner/src/container.rs @@ -22,15 +22,15 @@ const MAX_PREVIEW_LINES: usize = 128; /// Returns the exit code of the container that will be passed to the parent pub fn run( msg_tx: Option>, - actor_path: &Path, + runner_path: &Path, container_id: &str, root_user_enabled: bool, ) -> Result { - // Extract actor id from path - let actor_id = actor_path + // Extract runner id from path + let runner_id = runner_path .iter() .last() - .context("empty `actor_path`")? + .context("empty `runner_path`")? 
 		.to_string_lossy()
 		.to_string();
 
-	let fs_path = actor_path.join("fs").join("upper");
+	let fs_path = runner_path.join("fs").join("upper");
 
@@ -66,7 +66,7 @@ pub fn run(
 	// Spawn runc container
 	println!(
 		"Starting container {} with OCI bundle {}",
-		actor_id,
+		runner_id,
 		fs_path.display()
 	);
 
@@ -90,7 +90,7 @@
 	let container_id2 = container_id.to_owned();
 	thread::spawn(move || {
 		for _ in signals.forever() {
-			println!("Received SIGTERM, forwarding to runc container {actor_id}");
+			println!("Received SIGTERM, forwarding to runc container {runner_id}");
 			let status = Command::new("runc")
 				.arg("kill")
 				.arg("--all")
diff --git a/packages/edge/infra/client/container-runner/src/log_shipper.rs b/packages/edge/infra/client/container-runner/src/log_shipper.rs
index 32354594a2..8dabafbd39 100644
--- a/packages/edge/infra/client/container-runner/src/log_shipper.rs
+++ b/packages/edge/infra/client/container-runner/src/log_shipper.rs
@@ -37,7 +37,7 @@ pub struct LogShipper {
 
 	pub vector_socket_addr: String,
 
-	pub actor_id: String,
+	pub runner_id: String,
 	pub env_id: Uuid,
 }
 
@@ -92,8 +92,8 @@ impl LogShipper {
 		println!("Log shipper connected");
 
 		while let Result::Ok(message) = self.msg_rx.recv() {
-			let vector_message = VectorMessage::Actors {
-				actor_id: self.actor_id.as_str(),
+			let vector_message = VectorMessage::Runners {
+				runner_id: self.runner_id.as_str(),
 				env_id: self.env_id,
 				stream_type: message.stream_type as u8,
 				ts: message.ts,
@@ -114,9 +114,9 @@ impl LogShipper {
 #[derive(Serialize)]
 #[serde(tag = "source")]
 enum VectorMessage<'a> {
-	#[serde(rename = "actors")]
-	Actors {
-		actor_id: &'a str,
+	#[serde(rename = "runners")]
+	Runners {
+		runner_id: &'a str,
 		env_id: Uuid,
 		stream_type: u8,
 		ts: u64,
diff --git a/packages/edge/infra/client/container-runner/src/main.rs b/packages/edge/infra/client/container-runner/src/main.rs
index 0f207e569a..7018694d5a 100644
--- a/packages/edge/infra/client/container-runner/src/main.rs
+++ b/packages/edge/infra/client/container-runner/src/main.rs
@@ -18,15 +18,15 @@ const LOGS_RETENTION: Duration = Duration::from_secs(7 * 24 * 60 * 60);
 
 fn main() -> Result<()> {
 	let mut args = std::env::args().skip(1);
-	let actor_path_str = args.next().context("`actor_path` arg required")?;
+	let runner_path_str = args.next().context("`runner_path` arg required")?;
 	let container_id = args.next().context("`container_id` arg required")?;
-	let actor_path = Path::new(&actor_path_str);
+	let runner_path = Path::new(&runner_path_str);
 
-	rivet_logs::Logs::new(actor_path.join("logs"), LOGS_RETENTION).start_sync()?;
+	rivet_logs::Logs::new(runner_path.join("logs"), LOGS_RETENTION).start_sync()?;
 
 	// Write PID to file
 	fs::write(
-		actor_path.join("pid"),
+		runner_path.join("pid"),
 		std::process::id().to_string().as_bytes(),
 	)?;
 
@@ -36,9 +36,9 @@ fn main() -> Result<()> {
 		.map(|x| x.parse())
 		.transpose()
 		.context("failed to parse vector socket addr")?;
-	let actor_id = var("ACTOR_ID")?;
+	let runner_id = var("RUNNER_ID")?;
 	let env_id = Uuid::parse_str(&var("ENVIRONMENT_ID")?)?;
-	println!("Starting actor_id={actor_id} env_id={env_id} vector_socket_addr={} root_user_enabled={root_user_enabled}", vector_socket_addr.as_ref().map(|x| x.as_str()).unwrap_or("?"));
+	println!("Starting runner_id={runner_id} env_id={env_id} vector_socket_addr={} root_user_enabled={root_user_enabled}", vector_socket_addr.as_ref().map(|x| x.as_str()).unwrap_or("?"));
 
 	let (shutdown_tx, shutdown_rx) = mpsc::sync_channel(1);
 
@@ -50,7 +50,7 @@ fn main() -> Result<()> {
 		shutdown_rx,
 		msg_rx,
 		vector_socket_addr,
-		actor_id,
+		runner_id,
 		env_id,
 	};
 	let
log_shipper_thread = log_shipper.spawn(); @@ -112,7 +112,7 @@ fn main() -> Result<()> { } fs::write( - actor_path.join("exit-code"), + runner_path.join("exit-code"), exit_code.to_string().as_bytes(), )?; diff --git a/packages/edge/infra/client/echo/Cargo.toml b/packages/edge/infra/client/echo/Cargo.toml index ffe4a2e3b6..5527ff3f5f 100644 --- a/packages/edge/infra/client/echo/Cargo.toml +++ b/packages/edge/infra/client/echo/Cargo.toml @@ -6,4 +6,11 @@ authors = ["Rivet Gaming, LLC "] license = "Apache-2.0" [dependencies] -tiny_http = "0.12" +bytes = "1.0" +futures-util = "0.3" +http = "0.2" +serde_json = "1.0" +tokio = { version = "1.40", features = ["full",] } +tokio-tungstenite = "0.23.1" +uuid = { version = "1", features = ["v4", "serde"] } +warp = "0.3.7" diff --git a/packages/edge/infra/client/echo/src/main.rs b/packages/edge/infra/client/echo/src/main.rs index 6fe6a71f2b..58475b6260 100644 --- a/packages/edge/infra/client/echo/src/main.rs +++ b/packages/edge/infra/client/echo/src/main.rs @@ -1,30 +1,145 @@ -use tiny_http::{Response, Server, StatusCode}; +use std::{env, net::SocketAddr, sync::Arc, time::Duration}; -// TODO: This can't pick up SIGTERM -fn main() { - println!("Env:"); - for (key, value) in std::env::vars() { +use futures_util::{SinkExt, StreamExt}; +use serde_json::json; +use tokio::sync::Mutex; +use tokio_tungstenite::{connect_async, tungstenite::protocol::Message}; +use uuid::Uuid; +use warp::Filter; + +const PING_INTERVAL: Duration = Duration::from_secs(1); + +#[tokio::main] +async fn main() { + // Print all environment variables + println!("Environment variables:"); + for (key, value) in env::vars() { println!(" {}: {}", key, value); } - let port = std::env::var("PORT_MAIN").expect("no PORT_MAIN"); - let addr = format!("0.0.0.0:{port}"); - let server = Server::http(&addr).unwrap(); - println!("Listening on {addr}"); - - for mut request in server.incoming_requests() { - println!("req"); - - let mut content = Vec::new(); - request.as_reader().read_to_end(&mut content).unwrap(); - - let response = Response::new( - StatusCode(200), - Vec::new(), - std::io::Cursor::new(content), - request.body_length(), - None, - ); - request.respond(response).unwrap(); + // Get manager connection details from env vars + let manager_ip = env::var("RIVET_MANAGER_IP").expect("RIVET_MANAGER_IP not set"); + let manager_port = env::var("RIVET_MANAGER_PORT").expect("RIVET_MANAGER_PORT not set"); + let manager_addr = format!("ws://{}:{}", manager_ip, manager_port); + + // Get HTTP server port from env var or use default + let http_port = env::var("PORT_MAIN") + .expect("PORT_MAIN not set") + .parse::() + .expect("bad PORT_MAIN"); + + // Spawn the WebSocket client + tokio::spawn(async move { + if let Err(e) = run_websocket_client(&manager_addr).await { + eprintln!("WebSocket client error: {}", e); + } + }); + + // Start the HTTP server + let http_addr: SocketAddr = ([0, 0, 0, 0], http_port).into(); + println!("Starting HTTP server on {}", http_addr); + + // Define the echo route + let echo = warp::any().and(warp::body::bytes()).map(|body| { + println!("Received HTTP request"); + + http::response::Builder::new() + .status(warp::http::StatusCode::OK) + .body(body) + .unwrap() + }); + + // Start the server + warp::serve(echo).run(http_addr).await; +} + +async fn run_websocket_client(url: &str) -> Result<(), Box> { + println!("Connecting to WebSocket at {}", url); + + // Connect to the WebSocket server + let (ws_stream, _) = connect_async(url).await?; + println!("WebSocket connection established"); 
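For reference while reading the echo client: the `init` and `actor_state_update` payloads it builds below are hand-rolled JSON for the `ToManager` enum added in runner_protocol.rs earlier in this patch, which serde serializes externally tagged with snake_case variant names. A sketch of that wire shape with a trimmed copy of the enum; the payload types elided by the flattened diff (e.g. the exit code) are assumed here to be `i32`:

// Trimmed, illustrative copy of the runner protocol types; the real enums in
// runner_protocol.rs also derive Deserialize and deny unknown fields.
use serde::Serialize;
use uuid::Uuid;

#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
enum ToManager {
	Init { runner_id: Uuid },
	ActorStateUpdate { actor_id: Uuid, generation: u32, state: ActorState },
}

#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
enum ActorState {
	Running,
	Exited { exit_code: Option<i32> }, // exit code type assumed; elided in the diff
}

fn main() -> serde_json::Result<()> {
	// {"init":{"runner_id":"00000000-0000-0000-0000-000000000000"}}
	println!("{}", serde_json::to_string(&ToManager::Init { runner_id: Uuid::nil() })?);

	// {"actor_state_update":{"actor_id":"...","generation":0,"state":{"exited":{"exit_code":null}}}}
	println!("{}", serde_json::to_string(&ToManager::ActorStateUpdate {
		actor_id: Uuid::nil(),
		generation: 0,
		state: ActorState::Exited { exit_code: None },
	})?);

	// Unit variant: serde emits the bare string "running"; the echo client's
	// {"running": null} form is also accepted by serde when deserializing.
	println!("{}", serde_json::to_string(&ActorState::Running)?);
	Ok(())
}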
+ + // Split the stream + let (mut write, mut read) = ws_stream.split(); + + let payload = json!({ + "init": { + "runner_id": Uuid::nil(), + }, + }); + + let data = serde_json::to_vec(&payload)?; + write.send(Message::Binary(data)).await?; + println!("Sent init message"); + + // Ping thread + let write = Arc::new(Mutex::new(write)); + let write2 = write.clone(); + tokio::spawn(async move { + loop { + tokio::time::sleep(PING_INTERVAL).await; + + if write2 + .lock() + .await + .send(Message::Ping(Vec::new())) + .await + .is_err() + { + break; + } + } + }); + + // Process incoming messages + while let Some(message) = read.next().await { + match message { + Ok(msg) => match msg { + Message::Pong(_) => {} + Message::Binary(buf) => { + let packet = serde_json::from_slice::(&buf)?; + println!("Received packet: {packet:?}"); + + if let Some(packet) = packet.get("start_actor") { + let payload = json!({ + "actor_state_update": { + "actor_id": packet["actor_id"], + "generation": packet["generation"], + "state": { + "running": null, + }, + }, + }); + + let data = serde_json::to_vec(&payload)?; + write.lock().await.send(Message::Binary(data)).await?; + } else if let Some(packet) = packet.get("signal_actor") { + let payload = json!({ + "actor_state_update": { + "actor_id": packet["actor_id"], + "generation": packet["generation"], + "state": { + "exited": { + "exit_code": null, + }, + }, + }, + }); + + let data = serde_json::to_vec(&payload)?; + write.lock().await.send(Message::Binary(data)).await?; + } + } + msg => eprintln!("Unexpected message: {msg:?}"), + }, + Err(e) => { + eprintln!("Error reading message: {}", e); + break; + } + } } + + println!("WebSocket connection closed"); + Ok(()) } diff --git a/packages/edge/infra/client/isolate-v8-runner/Cargo.toml b/packages/edge/infra/client/isolate-v8-runner/Cargo.toml deleted file mode 100644 index 3821841245..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/Cargo.toml +++ /dev/null @@ -1,47 +0,0 @@ -[package] -name = "pegboard-isolate-v8-runner" -version.workspace = true -authors.workspace = true -license.workspace = true -edition.workspace = true - -[[bin]] -name = "rivet-isolate-v8-runner" -path = "src/main.rs" - -[dependencies] -anyhow.workspace = true -deno_ast = "0.42.1" -deno_core.workspace = true -deno_runtime.workspace = true -fdb-util.workspace = true -foundationdb.workspace = true -futures-util = { version = "0.3" } -netif = "0.1.6" -nix.workspace = true -pegboard-actor-kv = { workspace = true } -pegboard-config.workspace = true -pegboard.workspace = true -rivet-api.workspace = true -rivet-convert.workspace = true -rivet-logs.workspace = true -rivet-runtime.workspace = true -serde = { version = "1.0.195", features = ["derive"] } -serde_json = "1.0.111" -service-discovery.workspace = true -signal-hook = "0.3.17" -tempfile = "3.13.0" -tokio-tungstenite = "0.23.1" -tokio.workspace = true -tracing-logfmt.workspace = true -tracing-subscriber.workspace = true -tracing.workspace = true -twox-hash = "1.6.3" -uuid = { version = "1.6.1", features = ["v4"] } -rustls = "0.23.25" - -[dev-dependencies] -portpicker = "0.1.1" -tempfile = "3.9.0" -uuid = { version = "1.6.1", features = ["v4"] } - diff --git a/packages/edge/infra/client/isolate-v8-runner/Dockerfile b/packages/edge/infra/client/isolate-v8-runner/Dockerfile deleted file mode 100644 index 1cbd5d5a24..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -# IMPORTANT: This version is required for GLIBC 2.31 (used by edge servers on 
Linode) -FROM rust:1.82.0-bullseye AS rust - -WORKDIR /app -COPY . . - -# Installs shared libs -# -# The FDB version should match `cluster::workflows::server::install::install_scripts::components::fdb::FDB_VERSION` -RUN \ - apt-get update -y && \ - apt-get install -y \ - libclang-dev protobuf-compiler && \ - curl -Lf -o /lib/libfdb_c.so "https://github.com/apple/foundationdb/releases/download/7.1.60/libfdb_c.x86_64.so" - -RUN \ - --mount=type=cache,target=/root/.cargo/git \ - --mount=type=cache,target=/root/.cargo/registry \ - --mount=type=cache,target=/app/target \ - RUSTFLAGS="--cfg tokio_unstable" cargo build --target x86_64-unknown-linux-gnu --bin rivet-isolate-v8-runner && \ - mkdir -p /app/dist && \ - mv /app/target/x86_64-unknown-linux-gnu/debug/rivet-isolate-v8-runner /app/dist/rivet-isolate-v8-runner - -# Create an empty image and copy binaries into it to minimize the size of the image -FROM scratch -COPY --from=rust /app/dist/ / - -# Allows `docker create` to work even though this fails -CMD [""] diff --git a/packages/edge/infra/client/isolate-v8-runner/Dockerfile.dockerignore b/packages/edge/infra/client/isolate-v8-runner/Dockerfile.dockerignore deleted file mode 100644 index 0096277683..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/Dockerfile.dockerignore +++ /dev/null @@ -1,8 +0,0 @@ -* - -!Cargo.lock -!Cargo.toml -!packages -!resources/legacy/proto -!sdks/api/full/rust/Cargo.toml -!sdks/api/full/rust/src diff --git a/packages/edge/infra/client/isolate-v8-runner/README.md b/packages/edge/infra/client/isolate-v8-runner/README.md deleted file mode 100644 index 6458169a5c..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# isolate-v8-runner - -This crate is used to run JavaScript on the pegboard servers themselves. This takes care of log shipping, rate -limiting logs, and more. - -In contrast to the container runner which runs a single container per process, this runs multiple isolates in -a single v8 runtime. - -## Deployment - -This gets built & deployed in `infra/tf/infra-artifacts/` then used in `TODO`. diff --git a/packages/edge/infra/client/isolate-v8-runner/js/40_rivet_kv.js b/packages/edge/infra/client/isolate-v8-runner/js/40_rivet_kv.js deleted file mode 100644 index 6b28f41b08..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/js/40_rivet_kv.js +++ /dev/null @@ -1,270 +0,0 @@ -// DO NOT MODIFY -// -// Generated with scripts/sdk_actor/compile_bridge.ts - -import { core } from "ext:core/mod.js"; -import { op_rivet_kv_delete, op_rivet_kv_delete_all, op_rivet_kv_delete_batch, op_rivet_kv_get, op_rivet_kv_get_batch, op_rivet_kv_list, op_rivet_kv_put, op_rivet_kv_put_batch, } from "ext:core/ops"; -import { deepEqual } from "./lib/fast-equals/index.js"; -/** - * Retrieves a value from the key-value store. - */ -export async function get(key, options) { - const entry = await op_rivet_kv_get(serializeKey(key)); - if (entry == null) - return null; - return deserializeValue(key, entry.value, options?.format); -} -/** - * Retrieves a batch of key-value pairs. - */ -export async function getBatch(keys, options) { - const entries = await op_rivet_kv_get_batch(keys.map((x) => serializeKey(x))); - return new HashMap(entries.map(([key, entry]) => { - const jsKey = deserializeKey(key); - return [ - jsKey, - deserializeValue(jsKey, entry.value, options?.format), - ]; - })); -} -/** - * Retrieves all key-value pairs in the KV store. 
When using any of the options, the keys lexicographic order - * is used for filtering. - * - * @param {ListOptions} [options] - Options. - * @returns {Promise>} The retrieved values. - */ -export async function list(options) { - // Build query - let query; - if (options?.prefix) { - query = { - prefix: serializeListKey(options.prefix), - }; - } - else if (options?.start) { - if (!options.end) { - throw new Error("must set options.end with options.start"); - } - query = { - rangeInclusive: [ - serializeListKey(options.start), - serializeKey(options.end), - ], - }; - } - else if (options?.startAfter) { - if (!options.end) { - throw new Error("must set options.end with options.startAfter"); - } - query = { - rangeExclusive: [ - serializeListKey(options.startAfter), - serializeKey(options.end), - ], - }; - } - else if (options?.end) { - throw new Error("must set options.start or options.startAfter with options.end"); - } - else { - query = { all: {} }; - } - const entries = await op_rivet_kv_list(query, options?.reverse ?? false, options?.limit); - return new HashMap(entries.map(([key, entry]) => { - const jsKey = deserializeKey(key); - return [ - jsKey, - deserializeValue(jsKey, entry.value, options?.format), - ]; - })); -} -/** - * Stores a key-value pair in the key-value store. - * - * @param {Key} key - The key under which the value will be stored. - * @param {Entry | ArrayBuffer} value - The value to be stored, which will be serialized. - * @param {PutOptions} [options] - Options. - * @returns {Promise} A promise that resolves when the operation is complete. - */ -export async function put(key, value, options) { - validateType(value, null, options?.format); - const format = options?.format ?? "value"; - let serializedValue; - if (format === "value") { - serializedValue = core.serialize(value, { forStorage: true }); - } - else if (format === "arrayBuffer") { - if (value instanceof ArrayBuffer) - serializedValue = new Uint8Array(value); - else { - throw new Error(`value must be of type \`ArrayBuffer\` if format is "arrayBuffer"`); - } - } - else { - // Handled by validateType - throw new Error(`unreachable format: \`${format}\``); - } - await op_rivet_kv_put(serializeKey(key), serializedValue); -} -/** - * Stores a batch of key-value pairs. - * - * @param {Map} obj - An object containing key-value pairs to be stored. - * @param {PutBatchOptions} [options] - Options. - * @returns {Promise} A promise that resolves when the batch operation is complete. - */ -export async function putBatch(obj, options) { - const serializedObj = new Map(); - const format = options?.format ?? "value"; - for (const [key, value] of obj) { - validateType(value, key, format); - let serializedValue; - if (format === "value") { - serializedValue = core.serialize(value, { forStorage: true }); - } - else if (format === "arrayBuffer") { - if (value instanceof ArrayBuffer) - serializedValue = new Uint8Array(value); - else { - throw new Error(`value in key "${key}" must be of type \`ArrayBuffer\` if format is "arrayBuffer"`); - } - } - else { - // Handled by validateType - throw new Error(`unreachable format: \`${format}\``); - } - serializedObj.set(serializeKey(key), serializedValue); - } - await op_rivet_kv_put_batch(serializedObj); -} -/** - * Deletes a key-value pair from the key-value store. - * - * @param {Key} key - The key of the key-value pair to delete. - * @returns {Promise} A promise that resolves when the operation is complete. 
- */ -export async function delete_(key) { - return await op_rivet_kv_delete(serializeKey(key)); -} -/** - * Deletes a batch of key-value pairs from the key-value store. - * - * @param {Key[]} keys - A list of keys to delete. - * @returns {Promise} A promise that resolves when the operation is complete. - */ -export async function deleteBatch(keys) { - return await op_rivet_kv_delete_batch(keys.map((x) => serializeKey(x))); -} -/** - * Deletes all data from the key-value store. **This CANNOT be undone.** - * - * @returns {Promise} A promise that resolves when the operation is complete. - */ -export async function deleteAll() { - return await op_rivet_kv_delete_all(); -} -function validateType(value, key, format = "value") { - const keyText = key ? ` in key "{key}"` : ""; - if (format === "value") { - if (value instanceof Blob) { - throw new Error(`the type ${value.constructor.name}${keyText} is not serializable in Deno, but you can use a TypedArray instead. See https://github.com/denoland/deno/issues/12067#issuecomment-1975001079.`); - } - if (value instanceof CryptoKey || - value instanceof DOMException || - // Not defined in Deno - // value instanceof RTCCertificate || - // We don't load in the canvas ext into the the Deno runtime for Rivet - // value instanceof ImageBitmap || - value instanceof ImageData) { - throw new Error(`the type ${value.constructor.name}${keyText} is not serializable in Deno. See https://github.com/denoland/deno/issues/12067#issuecomment-1975001079.`); - } - } - else if (format === "arrayBuffer") { - if (!(value instanceof ArrayBuffer)) { - throw new Error(`value must be an ArrayBuffer if options.format = "arrayBuffer".`); - } - } - else { - throw new Error("unexpected key type from KV driver"); - } -} -function serializeKey(key) { - if (Array.isArray(key)) { - return { jsInKey: key.map((x) => core.serialize(x)) }; - } - return { jsInKey: [core.serialize(key)] }; -} -function serializeListKey(key) { - if (Array.isArray(key)) { - return key.map((x) => core.serialize(x)); - } - return [core.serialize(key)]; -} -function deserializeKey(key) { - if ("inKey" in key || "outKey" in key) { - const jsKey = key.inKey ?? key.outKey; - const tuple = jsKey.map((x) => core.deserialize(x)); - if (tuple.length === 1) - return tuple[0]; - return tuple; - } - throw new Error("unexpected key type from KV driver"); -} -function deserializeValue(key, value, format = "value") { - if (value === undefined) - return value; - if (format === "value") { - try { - return core.deserialize(value, { forStorage: true }); - } - catch (e) { - throw new Error(`could not deserialize value in key "${key}". you must use options.format = "arrayBuffer".`, { cause: e }); - } - } - else if (format === "arrayBuffer") { - return value.buffer; - } - else { - throw Error(`invalid format: "${format}". expected "value" or "arrayBuffer".`); - } -} -class HashMap { - #internal; - constructor(internal) { - this.#internal = internal; - } - get(key) { - for (const [k, v] of this.#internal) { - if (deepEqual(key, k)) - return v; - } - return undefined; - } - /** - * Returns a map of keys to values. **WARNING** Using `.get` on the returned map does not work as expected - * with complex types (arrays, objects, etc). Use `.get` on this class instead. 
- */ - raw() { - return new Map(this.#internal); - } - array() { - return this.#internal; - } - entries() { - return this[Symbol.iterator](); - } - [Symbol.iterator]() { - return this.#internal[Symbol.iterator](); - } -} -export const KV_NAMESPACE = { - get, - getBatch, - list, - put, - putBatch, - delete: delete_, - deleteBatch, - deleteAll, -}; diff --git a/packages/edge/infra/client/isolate-v8-runner/js/90_rivet_ns.js b/packages/edge/infra/client/isolate-v8-runner/js/90_rivet_ns.js deleted file mode 100644 index 41b6a7567d..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/js/90_rivet_ns.js +++ /dev/null @@ -1,29 +0,0 @@ -// DO NOT MODIFY -// -// Generated with scripts/sdk_actor/compile_bridge.ts - -import { primordials } from "ext:core/mod.js"; -import { KV_NAMESPACE } from "ext:rivet_kv/40_rivet_kv.js"; -const { ReflectOwnKeys, ObjectFreeze } = primordials; -export function deepFreeze(object) { - // Retrieve the property names defined on object - const propNames = ReflectOwnKeys(object); - // Freeze properties before freezing self - for (const name of propNames) { - // biome-ignore lint/suspicious/noExplicitAny: Unknown object type - const value = object[name]; - // Check if value is an array or object and not null - if (value && - (Array.isArray(value) || - typeof value === "object" || - typeof value === "function")) { - deepFreeze(value); - } - } - return ObjectFreeze(object); -} -export const ACTOR_CONTEXT = { - // Populated at runtime - metadata: undefined, - kv: KV_NAMESPACE, -}; diff --git a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/comparator.js b/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/comparator.js deleted file mode 100644 index dad51b096a..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/comparator.js +++ /dev/null @@ -1,220 +0,0 @@ -// DO NOT MODIFY -// -// Generated with scripts/sdk_actor/compile_bridge.ts - -import { areArraysEqual as areArraysEqualDefault, areDatesEqual as areDatesEqualDefault, areMapsEqual as areMapsEqualDefault, areObjectsEqual as areObjectsEqualDefault, areObjectsEqualStrict as areObjectsEqualStrictDefault, arePrimitiveWrappersEqual as arePrimitiveWrappersEqualDefault, areRegExpsEqual as areRegExpsEqualDefault, areSetsEqual as areSetsEqualDefault, areTypedArraysEqual, } from "./equals.js"; -import { combineComparators, createIsCircular } from "./utils.js"; -const ARGUMENTS_TAG = "[object Arguments]"; -const BOOLEAN_TAG = "[object Boolean]"; -const DATE_TAG = "[object Date]"; -const MAP_TAG = "[object Map]"; -const NUMBER_TAG = "[object Number]"; -const OBJECT_TAG = "[object Object]"; -const REG_EXP_TAG = "[object RegExp]"; -const SET_TAG = "[object Set]"; -const STRING_TAG = "[object String]"; -const { isArray } = Array; -const isTypedArray = typeof ArrayBuffer === "function" && ArrayBuffer.isView - ? ArrayBuffer.isView - : null; -const { assign } = Object; -const getTag = Object.prototype.toString.call.bind(Object.prototype.toString); -/** - * Create a comparator method based on the type-specific equality comparators passed. - */ -export function createEqualityComparator({ areArraysEqual, areDatesEqual, areMapsEqual, areObjectsEqual, arePrimitiveWrappersEqual, areRegExpsEqual, areSetsEqual, areTypedArraysEqual, }) { - /** - * compare the value of the two objects and return true if they are equivalent in values - */ - return function comparator(a, b, state) { - // If the items are strictly equal, no need to do a value comparison. 
- if (a === b) { - return true; - } - // If the items are not non-nullish objects, then the only possibility - // of them being equal but not strictly is if they are both `NaN`. Since - // `NaN` is uniquely not equal to itself, we can use self-comparison of - // both objects, which is faster than `isNaN()`. - if (a == null || - b == null || - typeof a !== "object" || - typeof b !== "object") { - return a !== a && b !== b; - } - const constructor = a.constructor; - // Checks are listed in order of commonality of use-case: - // 1. Common complex object types (plain object, array) - // 2. Common data values (date, regexp) - // 3. Less-common complex object types (map, set) - // 4. Less-common data values (promise, primitive wrappers) - // Inherently this is both subjective and assumptive, however - // when reviewing comparable libraries in the wild this order - // appears to be generally consistent. - // Constructors should match, otherwise there is potential for false positives - // between class and subclass or custom object and POJO. - if (constructor !== b.constructor) { - return false; - } - // `isPlainObject` only checks against the object"s own realm. Cross-realm - // comparisons are rare, and will be handled in the ultimate fallback, so - // we can avoid capturing the string tag. - if (constructor === Object) { - return areObjectsEqual(a, b, state); - } - // `isArray()` works on subclasses and is cross-realm, so we can avoid capturing - // the string tag or doing an `instanceof` check. - if (isArray(a)) { - return areArraysEqual(a, b, state); - } - // `isTypedArray()` works on all possible TypedArray classes, so we can avoid - // capturing the string tag or comparing against all possible constructors. - if (isTypedArray != null && isTypedArray(a)) { - return areTypedArraysEqual(a, b, state); - } - // Try to fast-path equality checks for other complex object types in the - // same realm to avoid capturing the string tag. Strict equality is used - // instead of `instanceof` because it is more performant for the common - // use-case. If someone is subclassing a native class, it will be handled - // with the string tag comparison. - if (constructor === Date) { - return areDatesEqual(a, b, state); - } - if (constructor === RegExp) { - return areRegExpsEqual(a, b, state); - } - if (constructor === Map) { - return areMapsEqual(a, b, state); - } - if (constructor === Set) { - return areSetsEqual(a, b, state); - } - // Since this is a custom object, capture the string tag to determing its type. - // This is reasonably performant in modern environments like v8 and SpiderMonkey. - const tag = getTag(a); - if (tag === DATE_TAG) { - return areDatesEqual(a, b, state); - } - if (tag === REG_EXP_TAG) { - return areRegExpsEqual(a, b, state); - } - if (tag === MAP_TAG) { - return areMapsEqual(a, b, state); - } - if (tag === SET_TAG) { - return areSetsEqual(a, b, state); - } - if (tag === OBJECT_TAG) { - // The exception for value comparison is custom `Promise`-like class instances. These should - // be treated the same as standard `Promise` objects, which means strict equality, and if - // it reaches this point then that strict equality comparison has already failed. - return (typeof a.then !== "function" && - typeof b.then !== "function" && - areObjectsEqual(a, b, state)); - } - // If an arguments tag, it should be treated as a standard object. - if (tag === ARGUMENTS_TAG) { - return areObjectsEqual(a, b, state); - } - // As the penultimate fallback, check if the values passed are primitive wrappers. 
This - // is very rare in modern JS, which is why it is deprioritized compared to all other object - // types. - if (tag === BOOLEAN_TAG || tag === NUMBER_TAG || tag === STRING_TAG) { - return arePrimitiveWrappersEqual(a, b, state); - } - // If not matching any tags that require a specific type of comparison, then we hard-code false because - // the only thing remaining is strict equality, which has already been compared. This is for a few reasons: - // - Certain types that cannot be introspected (e.g., `WeakMap`). For these types, this is the only - // comparison that can be made. - // - For types that can be introspected, but rarely have requirements to be compared - // (`ArrayBuffer`, `DataView`, etc.), the cost is avoided to prioritize the common - // use-cases (may be included in a future release, if requested enough). - // - For types that can be introspected but do not have an objective definition of what - // equality is (`Error`, etc.), the subjective decision is to be conservative and strictly compare. - // In all cases, these decisions should be reevaluated based on changes to the language and - // common development practices. - return false; - }; -} -/** - * Create the configuration object used for building comparators. - */ -export function createEqualityComparatorConfig({ circular, createCustomConfig, strict, }) { - let config = { - areArraysEqual: strict - ? areObjectsEqualStrictDefault - : areArraysEqualDefault, - areDatesEqual: areDatesEqualDefault, - areMapsEqual: strict - ? combineComparators(areMapsEqualDefault, areObjectsEqualStrictDefault) - : areMapsEqualDefault, - areObjectsEqual: strict - ? areObjectsEqualStrictDefault - : areObjectsEqualDefault, - arePrimitiveWrappersEqual: arePrimitiveWrappersEqualDefault, - areRegExpsEqual: areRegExpsEqualDefault, - areSetsEqual: strict - ? combineComparators(areSetsEqualDefault, areObjectsEqualStrictDefault) - : areSetsEqualDefault, - areTypedArraysEqual: strict - ? areObjectsEqualStrictDefault - : areTypedArraysEqual, - }; - if (createCustomConfig) { - config = assign({}, config, createCustomConfig(config)); - } - if (circular) { - const areArraysEqual = createIsCircular(config.areArraysEqual); - const areMapsEqual = createIsCircular(config.areMapsEqual); - const areObjectsEqual = createIsCircular(config.areObjectsEqual); - const areSetsEqual = createIsCircular(config.areSetsEqual); - config = assign({}, config, { - areArraysEqual, - areMapsEqual, - areObjectsEqual, - areSetsEqual, - }); - } - return config; -} -/** - * Default equality comparator pass-through, used as the standard `isEqual` creator for - * use inside the built comparator. - */ -export function createInternalEqualityComparator(compare) { - return (a, b, _indexOrKeyA, _indexOrKeyB, _parentA, _parentB, state) => compare(a, b, state); -} -/** - * Create the `isEqual` function used by the consuming application. - */ -export function createIsEqual({ circular, comparator, createState, equals, strict, }) { - if (createState) { - return function isEqual(a, b) { - const { cache = circular ? 
new WeakMap() : undefined, meta } = createState(); - return comparator(a, b, { - cache, - equals, - meta, - strict, - }); - }; - } - if (circular) { - return function isEqual(a, b) { - return comparator(a, b, { - cache: new WeakMap(), - equals, - meta: undefined, - strict, - }); - }; - } - const state = { - cache: undefined, - equals, - meta: undefined, - strict, - }; - return function isEqual(a, b) { - return comparator(a, b, state); - }; -} diff --git a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/equals.js b/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/equals.js deleted file mode 100644 index ab7b4d6270..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/equals.js +++ /dev/null @@ -1,201 +0,0 @@ -// DO NOT MODIFY -// -// Generated with scripts/sdk_actor/compile_bridge.ts - -import { getStrictProperties, hasOwn, sameValueZeroEqual } from "./utils.js"; -const OWNER = "_owner"; -const { getOwnPropertyDescriptor, keys } = Object; -/** - * Whether the arrays are equal in value. - */ -export function areArraysEqual(a, b, state) { - let index = a.length; - if (b.length !== index) { - return false; - } - while (index-- > 0) { - if (!state.equals(a[index], b[index], index, index, a, b, state)) { - return false; - } - } - return true; -} -/** - * Whether the dates passed are equal in value. - */ -export function areDatesEqual(a, b) { - return sameValueZeroEqual(a.getTime(), b.getTime()); -} -/** - * Whether the `Map`s are equal in value. - */ -export function areMapsEqual(a, b, state) { - if (a.size !== b.size) { - return false; - } - const matchedIndices = {}; - const aIterable = a.entries(); - let index = 0; - let aResult; - let bResult; - while ((aResult = aIterable.next())) { - if (aResult.done) { - break; - } - const bIterable = b.entries(); - let hasMatch = false; - let matchIndex = 0; - while ((bResult = bIterable.next())) { - if (bResult.done) { - break; - } - const [aKey, aValue] = aResult.value; - const [bKey, bValue] = bResult.value; - if (!hasMatch && - !matchedIndices[matchIndex] && - (hasMatch = - state.equals(aKey, bKey, index, matchIndex, a, b, state) && - state.equals(aValue, bValue, aKey, bKey, a, b, state))) { - matchedIndices[matchIndex] = true; - } - matchIndex++; - } - if (!hasMatch) { - return false; - } - index++; - } - return true; -} -/** - * Whether the objects are equal in value. - */ -export function areObjectsEqual(a, b, state) { - const properties = keys(a); - let index = properties.length; - if (keys(b).length !== index) { - return false; - } - let property; - // Decrementing `while` showed faster results than either incrementing or - // decrementing `for` loop and than an incrementing `while` loop. Declarative - // methods like `some` / `every` were not used to avoid incurring the garbage - // cost of anonymous callbacks. - while (index-- > 0) { - property = properties[index]; - if (property === OWNER && - (a.$$typeof || b.$$typeof) && - a.$$typeof !== b.$$typeof) { - return false; - } - if (!hasOwn(b, property) || - !state.equals(a[property], b[property], property, property, a, b, state)) { - return false; - } - } - return true; -} -/** - * Whether the objects are equal in value with strict property checking. 
- */ -export function areObjectsEqualStrict(a, b, state) { - const properties = getStrictProperties(a); - let index = properties.length; - if (getStrictProperties(b).length !== index) { - return false; - } - let property; - let descriptorA; - let descriptorB; - // Decrementing `while` showed faster results than either incrementing or - // decrementing `for` loop and than an incrementing `while` loop. Declarative - // methods like `some` / `every` were not used to avoid incurring the garbage - // cost of anonymous callbacks. - while (index-- > 0) { - property = properties[index]; - if (property === OWNER && - (a.$$typeof || b.$$typeof) && - a.$$typeof !== b.$$typeof) { - return false; - } - if (!hasOwn(b, property)) { - return false; - } - if (!state.equals(a[property], b[property], property, property, a, b, state)) { - return false; - } - descriptorA = getOwnPropertyDescriptor(a, property); - descriptorB = getOwnPropertyDescriptor(b, property); - if ((descriptorA || descriptorB) && - (!descriptorA || - !descriptorB || - descriptorA.configurable !== descriptorB.configurable || - descriptorA.enumerable !== descriptorB.enumerable || - descriptorA.writable !== descriptorB.writable)) { - return false; - } - } - return true; -} -/** - * Whether the primitive wrappers passed are equal in value. - */ -export function arePrimitiveWrappersEqual(a, b) { - return sameValueZeroEqual(a.valueOf(), b.valueOf()); -} -/** - * Whether the regexps passed are equal in value. - */ -export function areRegExpsEqual(a, b) { - return a.source === b.source && a.flags === b.flags; -} -/** - * Whether the `Set`s are equal in value. - */ -export function areSetsEqual(a, b, state) { - if (a.size !== b.size) { - return false; - } - const matchedIndices = {}; - const aIterable = a.values(); - let aResult; - let bResult; - while ((aResult = aIterable.next())) { - if (aResult.done) { - break; - } - const bIterable = b.values(); - let hasMatch = false; - let matchIndex = 0; - while ((bResult = bIterable.next())) { - if (bResult.done) { - break; - } - if (!hasMatch && - !matchedIndices[matchIndex] && - (hasMatch = state.equals(aResult.value, bResult.value, aResult.value, bResult.value, a, b, state))) { - matchedIndices[matchIndex] = true; - } - matchIndex++; - } - if (!hasMatch) { - return false; - } - } - return true; -} -/** - * Whether the TypedArray instances are equal in value. - */ -export function areTypedArraysEqual(a, b) { - let index = a.length; - if (b.length !== index) { - return false; - } - while (index-- > 0) { - if (a[index] !== b[index]) { - return false; - } - } - return true; -} diff --git a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/index.js b/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/index.js deleted file mode 100644 index 10cc81e0fd..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/index.js +++ /dev/null @@ -1,74 +0,0 @@ -// DO NOT MODIFY -// -// Generated with scripts/sdk_actor/compile_bridge.ts - -import { createEqualityComparator, createEqualityComparatorConfig, createInternalEqualityComparator, createIsEqual, } from "./comparator.js"; -import { sameValueZeroEqual } from "./utils.js"; -export { sameValueZeroEqual }; -export * from "./internalTypes.js"; -/** - * Whether the items passed are deeply-equal in value. - */ -export const deepEqual = createCustomEqual(); -/** - * Whether the items passed are deeply-equal in value based on strict comparison. 
- */ -export const strictDeepEqual = createCustomEqual({ strict: true }); -/** - * Whether the items passed are deeply-equal in value, including circular references. - */ -export const circularDeepEqual = createCustomEqual({ circular: true }); -/** - * Whether the items passed are deeply-equal in value, including circular references, - * based on strict comparison. - */ -export const strictCircularDeepEqual = createCustomEqual({ - circular: true, - strict: true, -}); -/** - * Whether the items passed are shallowly-equal in value. - */ -export const shallowEqual = createCustomEqual({ - createInternalComparator: () => sameValueZeroEqual, -}); -/** - * Whether the items passed are shallowly-equal in value based on strict comparison. - */ -export const strictShallowEqual = createCustomEqual({ - strict: true, - createInternalComparator: () => sameValueZeroEqual, -}); -/** - * Whether the items passed are shallowly-equal in value, including circular references. - */ -export const circularShallowEqual = createCustomEqual({ - circular: true, - createInternalComparator: () => sameValueZeroEqual, -}); -/** - * Whether the items passed are shallowly-equal in value, including circular references, - * based on strict comparison. - */ -export const strictCircularShallowEqual = createCustomEqual({ - circular: true, - createInternalComparator: () => sameValueZeroEqual, - strict: true, -}); -/** - * Create a custom equality comparison method. - * - * This can be done to create very targeted comparisons in extreme hot-path scenarios - * where the standard methods are not performant enough, but can also be used to provide - * support for legacy environments that do not support expected features like - * `RegExp.prototype.flags` out of the box. - */ -export function createCustomEqual(options = {}) { - const { circular = false, createInternalComparator: createCustomInternalComparator, createState, strict = false, } = options; - const config = createEqualityComparatorConfig(options); - const comparator = createEqualityComparator(config); - const equals = createCustomInternalComparator - ? createCustomInternalComparator(comparator) - : createInternalEqualityComparator(comparator); - return createIsEqual({ circular, comparator, createState, equals, strict }); -} diff --git a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/internalTypes.js b/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/internalTypes.js deleted file mode 100644 index 25eac70081..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/internalTypes.js +++ /dev/null @@ -1,5 +0,0 @@ -// DO NOT MODIFY -// -// Generated with scripts/sdk_actor/compile_bridge.ts - -export {}; diff --git a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/utils.js b/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/utils.js deleted file mode 100644 index 52c753f13e..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/js/lib/fast-equals/utils.js +++ /dev/null @@ -1,56 +0,0 @@ -// DO NOT MODIFY -// -// Generated with scripts/sdk_actor/compile_bridge.ts - -const { getOwnPropertyNames, getOwnPropertySymbols } = Object; -const { hasOwnProperty } = Object.prototype; -/** - * Combine two comparators into a single comparator.
- */ -export function combineComparators(comparatorA, comparatorB) { - return function isEqual(a, b, state) { - return comparatorA(a, b, state) && comparatorB(a, b, state); - }; -} -/** - * Wrap the provided `areItemsEqual` method to manage the circular state, allowing - * for circular references to be safely included in the comparison without creating - * stack overflows. - */ -export function createIsCircular(areItemsEqual) { - return function isCircular(a, b, state) { - if (!a || !b || typeof a !== "object" || typeof b !== "object") { - return areItemsEqual(a, b, state); - } - const { cache } = state; - const cachedA = cache.get(a); - const cachedB = cache.get(b); - if (cachedA && cachedB) { - return cachedA === b && cachedB === a; - } - cache.set(a, b); - cache.set(b, a); - const result = areItemsEqual(a, b, state); - cache.delete(a); - cache.delete(b); - return result; - }; -} -/** - * Get the properties to strictly examine, which include both own properties that are - * not enumerable and symbol properties. - */ -export function getStrictProperties(object) { - return getOwnPropertyNames(object).concat(getOwnPropertySymbols(object)); -} -/** - * Whether the object contains the property passed as an own property. - */ -export const hasOwn = Object.hasOwn || - ((object, property) => hasOwnProperty.call(object, property)); -/** - * Whether the values passed are strictly equal or both NaN. - */ -export function sameValueZeroEqual(a, b) { - return a || b ? a === b : a === b || (a !== a && b !== b); -} diff --git a/packages/edge/infra/client/isolate-v8-runner/src/ext/kv.rs b/packages/edge/infra/client/isolate-v8-runner/src/ext/kv.rs deleted file mode 100644 index e55a3242dc..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/src/ext/kv.rs +++ /dev/null @@ -1,196 +0,0 @@ -use std::{collections::HashMap, future::Future, sync::Arc}; - -use deno_core::{error::AnyError, op2, JsBuffer, OpState, ToJsBuffer}; -use pegboard_actor_kv as actor_kv; -use serde::Serialize; - -type FakeMap = Box<[(T, U)]>; - -deno_core::extension!( - rivet_kv, - ops = [ - op_rivet_kv_get, - op_rivet_kv_get_batch, - op_rivet_kv_list, - op_rivet_kv_put, - op_rivet_kv_put_batch, - op_rivet_kv_delete, - op_rivet_kv_delete_batch, - op_rivet_kv_delete_all, - ], - esm = [ - dir "js", - // Order matters - "lib/fast-equals/utils.js", - "lib/fast-equals/equals.js", - "lib/fast-equals/comparator.js", - "lib/fast-equals/internalTypes.js", - "lib/fast-equals/index.js", - - "40_rivet_kv.js", - ], - options = { - kv: actor_kv::ActorKv, - }, - state = |state, options| { - state.put::>(Arc::new(options.kv)); - }, -); - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -enum Key { - InKey(Vec), - OutKey(Vec), -} - -impl From for Key { - fn from(value: actor_kv::key::Key) -> Self { - match value { - actor_kv::key::Key::JsInKey(tuple) => Key::InKey(tuple), - actor_kv::key::Key::JsOutKey(tuple) => { - Key::OutKey(tuple.into_iter().map(Into::into).collect()) - } - } - } -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -struct Entry { - metadata: Metadata, - value: ToJsBuffer, -} - -impl From for Entry { - fn from(value: actor_kv::Entry) -> Self { - Entry { - metadata: value.metadata.into(), - value: value.value.into(), - } - } -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct Metadata { - pub kv_version: ToJsBuffer, - pub create_ts: i64, -} - -impl From for Metadata { - fn from(value: actor_kv::Metadata) -> Self { - Metadata { - kv_version: value.kv_version.into(), - create_ts: 
value.create_ts, - } - } -} - -#[op2(async)] -#[serde] -pub fn op_rivet_kv_get( - state: &mut OpState, - #[serde] key: actor_kv::key::Key, -) -> Result, AnyError>>, AnyError> { - let kv = state.borrow::>().clone(); - - Ok(async move { - let res = kv.get(vec![key.into()]).await?; - - Ok(res.into_values().next().map(Into::into)) - }) -} - -#[op2(async)] -#[serde] -pub fn op_rivet_kv_get_batch( - state: &mut OpState, - #[serde] keys: Vec, -) -> Result, AnyError>>, AnyError> { - let kv = state.borrow::>().clone(); - - Ok(async move { - let res = kv - .get(keys.into_iter().map(Into::into).collect()) - .await? - .into_iter() - .map(|(k, v)| (k.into(), v.into())) - .collect(); - - Ok(res) - }) -} - -#[op2(async)] -#[serde] -pub fn op_rivet_kv_list( - state: &mut OpState, - #[serde] query: actor_kv::ListQuery, - reverse: bool, - limit: Option, -) -> Result, AnyError>>, AnyError> { - let kv = state.borrow::>().clone(); - - Ok(async move { - let res = kv - .list(query.into(), reverse, limit.map(|x| x as usize)) - .await? - .into_iter() - .map(|(k, v)| (k.into(), v.into())) - .collect(); - - Ok(res) - }) -} - -#[op2(async)] -pub fn op_rivet_kv_put( - state: &mut OpState, - #[serde] key: actor_kv::key::Key, - #[buffer] value: JsBuffer, -) -> Result>, AnyError> { - let kv = state.borrow::>().clone(); - - Ok(async move { kv.put([(key, value)].into()).await }) -} - -#[op2(async)] -pub fn op_rivet_kv_put_batch( - state: &mut OpState, - #[serde] obj: HashMap, -) -> Result>, AnyError> { - let kv = state.borrow::>().clone(); - - Ok(async move { kv.put(obj).await }) -} - -#[op2(async)] -pub fn op_rivet_kv_delete( - state: &mut OpState, - #[serde] key: actor_kv::key::Key, -) -> Result>, AnyError> { - let kv = state.borrow::>().clone(); - - Ok(async move { kv.delete(vec![key]).await }) -} - -#[op2(async)] -#[serde] -pub fn op_rivet_kv_delete_batch( - state: &mut OpState, - #[serde] keys: Vec, -) -> Result>, AnyError> { - let kv = state.borrow::>().clone(); - - Ok(async move { kv.delete(keys).await }) -} - -#[op2(async)] -pub fn op_rivet_kv_delete_all( - state: &mut OpState, -) -> Result>, AnyError> { - let kv = state.borrow::>().clone(); - - Ok(async move { kv.delete_all().await }) -} diff --git a/packages/edge/infra/client/isolate-v8-runner/src/ext/mod.rs b/packages/edge/infra/client/isolate-v8-runner/src/ext/mod.rs deleted file mode 100644 index 80d8020f2b..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/src/ext/mod.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub mod kv; -pub mod runtime; diff --git a/packages/edge/infra/client/isolate-v8-runner/src/ext/runtime.rs b/packages/edge/infra/client/isolate-v8-runner/src/ext/runtime.rs deleted file mode 100644 index c87c6b92ad..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/src/ext/runtime.rs +++ /dev/null @@ -1,11 +0,0 @@ -deno_core::extension!( - rivet_runtime, - deps = [ - rivet_kv - ], - esm_entry_point = "ext:rivet_runtime/90_rivet_ns.js", - esm = [ - dir "js", - "90_rivet_ns.js" - ], -); diff --git a/packages/edge/infra/client/isolate-v8-runner/src/isolate.rs b/packages/edge/infra/client/isolate-v8-runner/src/isolate.rs deleted file mode 100644 index 576cb5a0b5..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/src/isolate.rs +++ /dev/null @@ -1,648 +0,0 @@ -use std::{ - fs::File, - io::{BufReader, Write}, - net::Ipv4Addr, - os::fd::FromRawFd, - path::{Path, PathBuf}, - rc::Rc, - result::Result::{Err, Ok}, - sync::{mpsc as smpsc, Arc}, - thread::JoinHandle, -}; - -use anyhow::*; -use deno_core::{ - error::JsError, v8, 
v8::CreateParams, ModuleId, ModuleSpecifier, StaticModuleLoader, -}; -use deno_runtime::{ - deno_fs::InMemoryFs, - deno_io::{Stdio, StdioPipe}, - deno_permissions::{ - self, NetListenDescriptor, Permissions, PermissionsContainer, UnaryPermission, - }, - permissions::RuntimePermissionDescriptorParser, - worker::{MainWorker, MainWorkerTerminateHandle, WorkerOptions, WorkerServiceOptions}, -}; -use nix::{libc, unistd::pipe}; -use pegboard::protocol; -use pegboard_actor_kv::ActorKv; -use pegboard_config::isolate_runner as config; -use tokio::{fs, sync::mpsc}; -use utils::FdbPool; -use uuid::Uuid; - -use crate::{ext, log_shipper, metadata::JsMetadata, utils}; - -pub fn run( - config: config::Config, - fdb_pool: FdbPool, - actor_id: Uuid, - generation: u32, - terminate_tx: mpsc::Sender, -) -> Result<()> { - let actor_path = config.actors_path.join(format!("{actor_id}-{generation}")); - - // Write PID to file - std::fs::write( - actor_path.join("pid"), - std::process::id().to_string().as_bytes(), - )?; - - // Read config - let config_data = std::fs::read_to_string(actor_path.join("config.json")) - .context("Failed to read config file")?; - let actor_config = serde_json::from_str::(&config_data) - .context("Failed to parse config file")?; - - let (shutdown_tx, shutdown_rx) = smpsc::sync_channel(1); - - // Start log shipper - let (msg_tx, log_shipper_thread) = - if let Some(vector_socket_addr) = &actor_config.vector_socket_addr { - let (msg_tx, msg_rx) = smpsc::sync_channel::( - log_shipper::MAX_BUFFER_BYTES / log_shipper::MAX_LINE_BYTES, - ); - let metadata = actor_config.metadata.deserialize()?; - let log_shipper = log_shipper::LogShipper { - actor_id, - shutdown_rx, - msg_rx, - vector_socket_addr: vector_socket_addr.clone(), - env_id: metadata.environment.env_id, - }; - let log_shipper_thread = log_shipper.spawn(); - - (Some(msg_tx), Some(log_shipper_thread)) - } else { - (None, None) - }; - - // Run the isolate - let exit_code = match utils::tokio::create_and_run_current_thread(run_inner( - fdb_pool, - actor_path.clone(), - actor_id, - generation, - terminate_tx, - msg_tx.clone(), - actor_config, - ))? { - Result::Ok(exit_code) => exit_code, - Err(err) => { - tracing::error!(?actor_id, ?generation, "Run isolate failed: {err:?}"); - log_shipper::send_message( - actor_id, - &msg_tx, - None, - log_shipper::StreamType::StdErr, - "Fatal error. Aborting.".into(), - ); - - Some(1) - } - }; - - // Shutdown all threads - match shutdown_tx.send(()) { - Result::Ok(_) => { - tracing::info!(?actor_id, ?generation, "Sent shutdown signal"); - } - Err(err) => { - tracing::error!( - ?actor_id, - ?generation, - "Failed to send shutdown signal: {err:?}" - ); - } - } - - // Wait for log shipper to finish - drop(msg_tx); - if let Some(log_shipper_thread) = log_shipper_thread { - match log_shipper_thread.join() { - Result::Ok(_) => {} - Err(err) => { - tracing::error!(?actor_id, ?generation, "Log shipper failed: {err:?}") - } - } - } - - // Write exit code. 
None is written as no bytes - if let Some(code) = exit_code { - std::fs::write(actor_path.join("exit-code"), code.to_string().as_bytes())?; - } else { - std::fs::write(actor_path.join("exit-code"), &[])?; - } - - Ok(()) -} - -pub async fn run_inner( - fdb_pool: FdbPool, - actor_path: PathBuf, - actor_id: Uuid, - generation: u32, - terminate_tx: mpsc::Sender, - msg_tx: Option>, - actor_config: config::actor::Config, -) -> Result> { - tracing::info!(?actor_id, ?generation, "starting isolate"); - - // Init KV store (create or open) - let mut kv = ActorKv::new((&*fdb_pool).clone(), actor_id); - kv.init().await?; - - tracing::info!(?actor_id, ?generation, "isolate kv initialized"); - - // Should match the path from `Actor::make_fs` in manager/src/actor/setup.rs - let index = actor_path.join("fs").join("upper").join("index.js"); - - // Load index.js - let index_script_content = match fs::read_to_string(&index).await { - Ok(c) => c, - Err(err) => { - tracing::error!(?err, "Failed to load {}", index.display()); - - log_shipper::send_message( - actor_id, - &msg_tx, - None, - log_shipper::StreamType::StdErr, - "Failed to load /index.js".into(), - ); - - return Ok(Some(1)); - } - }; - - // Load script into a static module loader. No dynamic scripts can be loaded this way. - let index_module = ModuleSpecifier::from_file_path(Path::new("/index.js")) - .map_err(|_| anyhow!("invalid file name"))?; - let loader = StaticModuleLoader::new([(index_module.clone(), index_script_content)]); - - // TODO(RVT-4192): Replace with a custom fs that only reads from actor_path/fs - let fs = Arc::new(InMemoryFs::default()); - - // Build permissions - let permission_desc_parser = Arc::new(RuntimePermissionDescriptorParser::new(fs.clone())); - let mut permissions = Permissions::none_without_prompt(); - - // Outbound traffic - permissions.net = UnaryPermission::allow_all(); - // Sockets - let loopback = Ipv4Addr::new(0, 0, 0, 0); - permissions.net_listen = Permissions::new_unary::( - Some( - actor_config - .ports - .iter() - .map(|(_, port)| { - NetListenDescriptor::from_ipv4( - loopback, - Some(port.target), - match port.protocol { - protocol::TransportProtocol::Tcp => deno_permissions::Protocol::Tcp, - protocol::TransportProtocol::Udp => deno_permissions::Protocol::Udp, - }, - ) - }) - .collect(), - ), - None, - false, - ); - // We use a custom in-memory env - permissions.env = UnaryPermission::allow_all(); - - // Create pipes - let (stdout_read_fd, stdout_write_fd) = pipe()?; - let (stderr_read_fd, stderr_write_fd) = pipe()?; - - // SAFETY: These are created by pipes - let stdout_reader = std::fs::File::from(stdout_read_fd); - let stdout_writer = std::fs::File::from(stdout_write_fd); - let stderr_reader = std::fs::File::from(stderr_read_fd); - let stderr_writer = std::fs::File::from(stderr_write_fd); - let mut stderr_writer2 = stderr_writer.try_clone()?; - - let isolate_stdout = BufReader::new(stdout_reader); - let isolate_stderr = BufReader::new(stderr_reader); - - // Ship stdout & stderr logs - let stdout_handle = log_shipper::ship_logs( - actor_id, - msg_tx.clone(), - log_shipper::StreamType::StdOut, - isolate_stdout, - ); - let stderr_handle = log_shipper::ship_logs( - actor_id, - msg_tx.clone(), - log_shipper::StreamType::StdErr, - isolate_stderr, - ); - - // Build worker. 
If this errors, it's likely a problem with the runtime and not user input - let mut worker = MainWorker::try_bootstrap_from_options( - index_module.clone(), - WorkerServiceOptions { - module_loader: Rc::new(loader), - permissions: PermissionsContainer::new(permission_desc_parser, permissions), - blob_store: Default::default(), - broadcast_channel: Default::default(), - feature_checker: Default::default(), - node_services: Default::default(), - npm_process_state_provider: Default::default(), - root_cert_store_provider: Default::default(), - fetch_dns_resolver: Default::default(), - shared_array_buffer_store: Default::default(), - compiled_wasm_module_store: Default::default(), - v8_code_cache: Default::default(), - fs, - }, - WorkerOptions { - extensions: vec![ - ext::kv::rivet_kv::init_ops_and_esm(kv), - ext::runtime::rivet_runtime::init_ops_and_esm(), - ], - // Configure memory limits - create_params: { - fn floor_align(value: usize, alignment: usize) -> usize { - value & !(alignment - 1) - } - - // Memory must be aligned with PAGESIZE - let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }.try_into()?; - let mem = floor_align(actor_config.resources.memory.try_into()?, page_size); - let mem_max = floor_align(actor_config.resources.memory_max.try_into()?, page_size); - - Some(CreateParams::default().heap_limits(mem, mem_max)) - }, - stdio: Stdio { - // TODO: Make this read from /dev/null instead - stdin: StdioPipe::inherit(), - stdout: StdioPipe::file(stdout_writer), - stderr: StdioPipe::file(stderr_writer), - }, - env: actor_config.env.clone(), - ..Default::default() - }, - )?; - - // Send terminate handle to watcher task - terminate_tx.send(worker.terminate_handle().clone()).await?; - drop(terminate_tx); - - // First step preloads the module. This can throw a JS error from certain syntax. - match worker.preload_main_module(&index_module).await { - Ok(module_id) => { - tracing::info!(?actor_id, ?generation, "Isolate ready"); - - // Second step evaluates the module but does not run it (because it's sync). - let res = worker.evaluate_module_sync(module_id); - - if worker.is_terminated() { - tracing::info!(?actor_id, ?generation, "Isolate terminated"); - } else { - if let Err(err) = res { - tracing::info!(?actor_id, ?generation, "Isolate evaluation failed"); - - runtime_error(&mut stderr_writer2, &mut worker, err)?; - } else { - // Call `start` - match handle_entrypoint(actor_config, &mut worker, module_id) { - Ok(()) => { - // Third step runs event loop until stopped. We do this even after an error in - // case a beforeunload event handler was registered.
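// For reference, the loop that follows is the usual deno_runtime run pattern:
// drive the event loop to completion, then give `beforeunload` listeners a
// chance to schedule more work. A minimal sketch of that control flow using
// the same API (this assumes only a `worker: MainWorker` in scope and elides
// the termination checks and error shipping done here):

async fn drive_to_completion(worker: &mut deno_runtime::worker::MainWorker) -> anyhow::Result<()> {
    loop {
        // Run until the event loop has no pending work left (or fails).
        worker.run_event_loop(Default::default()).await?;

        // `true` means a `beforeunload` handler prevented unload, so new work
        // may have been scheduled; run the event loop again.
        let web_continue = worker.dispatch_beforeunload_event()?;
        if !web_continue {
            break;
        }
    }
    Ok(())
}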
- loop { - let res = worker.run_event_loop(Default::default()).await; - - if worker.is_terminated() { - tracing::info!(?actor_id, ?generation, "Isolate terminated"); - break; - } - - if let Err(err) = res { - tracing::info!( - ?actor_id, - ?generation, - "Isolate execution failed" - ); - - runtime_error(&mut stderr_writer2, &mut worker, err)?; - } - - // We dispatch the beforeunload event then run the event loop again - match worker.dispatch_beforeunload_event() { - Ok(web_continue) => { - if !web_continue { - break; - } - } - Err(err) => { - tracing::info!( - ?actor_id, - ?generation, - "Dispatch beforeunload event failed" - ); - - runtime_error(&mut stderr_writer2, &mut worker, err)?; - - break; - } - } - } - } - Err(err) => runtime_error(&mut stderr_writer2, &mut worker, err)?, - } - } - } - } - Err(err) => { - tracing::info!(?actor_id, ?generation, "Isolate preload failed"); - - match err.downcast::() { - // JS error - Ok(err) => runtime_error(&mut stderr_writer2, &mut worker, err.into())?, - Err(err) => { - // Also JS error - if deno_core::error::get_custom_error_class(&err).is_some() { - runtime_error(&mut stderr_writer2, &mut worker, err)?; - } - // Fatal error - else { - return Err(err); - } - } - } - } - } - - // For good measure - worker.v8_isolate().terminate_execution(); - - tracing::info!(?actor_id, ?generation, "Isolate complete"); - - let exit_code = if worker.is_terminated() { - None - } else { - Some(worker.exit_code()) - }; - - // Drop worker and writer so the stdout and stderr pipes close - drop(worker); - - wait_logs_complete( - actor_id, - generation, - stderr_writer2, - stdout_handle, - stderr_handle, - )?; - - Ok(exit_code) -} - -// Reads the `start` function from the default export of index.js and calls it. -fn handle_entrypoint( - actor_config: config::actor::Config, - worker: &mut MainWorker, - index_module_id: ModuleId, -) -> Result<()> { - let mm = worker.js_runtime.module_map(); - let scope = &mut worker.js_runtime.handle_scope(); - - // Get index.js mod - let g_ns = mm.get_module_namespace(scope, index_module_id)?; - let ns = v8::Local::new(scope, g_ns); - - // Get default export - let default_export_name = v8::String::new(scope, "default").context("v8 primitive")?; - let default_export = ns - .get(scope, default_export_name.into()) - .context("default export")? - .to_object(scope) - .context( - "Missing default export at index.js. Try: export default { start(ctx) { ... 
} }", - )?; - - // Get `start` export - let start_export_name = v8::String::new(scope, "start").context("v8 primitive")?; - let start_export = default_export - .get(scope, start_export_name.into()) - .context("Invalid `start` function in default export")?; - - // Parse `start` as function - let start_func = v8::Local::::try_from(start_export) - .context("Invalid `start` function in default export")?; - - // Get rivet ns - let rivet_ns_module_id = mm - .get_id( - &"ext:rivet_runtime/90_rivet_ns.js", - deno_core::RequestedModuleType::None, - ) - .context("ns should be loaded")?; - let rivet_g_ns = mm.get_module_namespace(scope, rivet_ns_module_id)?; - let rivet_ns = v8::Local::new(scope, rivet_g_ns); - - // Get deep freeze function - let deep_freeze_name = v8::String::new(scope, "deepFreeze").context("v8 primitive")?; - let deep_freeze = rivet_ns - .get(scope, deep_freeze_name.into()) - .context("deepFreeze")?; - let deep_freeze = v8::Local::::try_from(deep_freeze).context("deepFreeze")?; - - // Get actor context from ns - let ctx_export_name = v8::String::new(scope, "ACTOR_CONTEXT").context("v8 primitive")?; - let ctx_export = rivet_ns - .get(scope, ctx_export_name.into()) - .context("runtime export")? - .to_object(scope) - .context("ns is object")?; - - // Serialize metadata - let metadata = JsMetadata::from_actor(actor_config, scope)?; - let metadata = deno_core::serde_v8::to_v8(scope, metadata)?; - - // Add metadata - let metadata_key = v8::String::new(scope, "metadata") - .context("v8 primitive")? - .into(); - ctx_export.set(scope, metadata_key, metadata); - - // Freeze ctx - let frozen_ctx = deep_freeze - .call(scope, rivet_ns.into(), &[ctx_export.into()]) - .context("deepFreeze call")?; - - // Call `start` function - let res = start_func.call(scope, default_export.into(), &[frozen_ctx]); - - // Make sure `start` function async - match res { - Some(promise) if promise.is_promise() => {} - _ => bail!("`start` function must be async"), - } - - Ok(()) -} - -fn runtime_error(stderr_writer: &mut File, worker: &mut MainWorker, err: Error) -> Result<()> { - // Write final error to stderr - stderr_writer.write_all(err.to_string().as_bytes())?; - - // Update error code if not already errored - if worker.exit_code() == 0 { - worker.set_exit_code(1); - } - - Ok(()) -} - -/// Waits for logs to be written and log shipper threads to complete. 
-fn wait_logs_complete( - actor_id: Uuid, - generation: u32, - mut stderr_writer2: File, - stdout_handle: JoinHandle<()>, - stderr_handle: JoinHandle<()>, -) -> Result<()> { - stderr_writer2.flush()?; - drop(stderr_writer2); - - // Wait for threads to finish - match stdout_handle.join() { - Result::Ok(_) => {} - Err(err) => { - tracing::error!(?actor_id, ?generation, "stdout thread panicked: {err:?}"); - } - } - match stderr_handle.join() { - Result::Ok(_) => {} - Err(err) => { - tracing::error!(?actor_id, ?generation, "stderr thread panicked: {err:?}"); - } - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use std::{net::SocketAddr, path::Path, result::Result::Ok}; - - use anyhow::*; - use deno_runtime::worker::MainWorkerTerminateHandle; - use foundationdb as fdb; - use pegboard::protocol; - use pegboard_config::isolate_runner as config; - use tracing_subscriber::prelude::*; - use uuid::Uuid; - - use super::run_inner; - use crate::utils; - - // TODO: Currently requires an fdb container to be running already - #[tokio::test] - async fn test_isolate() -> Result<()> { - tracing_subscriber::registry() - .with( - tracing_logfmt::builder() - .with_ansi_color(true) - .layer() - .with_filter(tracing_subscriber::filter::LevelFilter::INFO), - ) - .init(); - - let tmp_dir = tempfile::TempDir::new().unwrap(); - let actors_path = tmp_dir.path().join("actors"); - let actor_id = Uuid::nil(); - let generation = 0; - - let fs_path = actors_path - .join(format!("{actor_id}-{generation}")) - .join("fs") - .join("upper"); - std::fs::create_dir_all(&fs_path)?; - - std::fs::copy( - Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/index.js"), - fs_path.join("index.js"), - )?; - std::fs::write(tmp_dir.path().join("fdb.cluster"), "fdb:fdb@127.0.0.1:4500")?; - - let config = config::Config { - // Not important - actors_path: Path::new("").to_path_buf(), - manager_ws_addr: SocketAddr::from(([0, 0, 0, 0], 0)), - }; - - deno_core::v8_set_flags(vec![ - // Binary name - "UNUSED_BUT_NECESSARY_ARG0".into(), - // Disable eval - "--disallow-code-generation-from-strings".into(), - ]); - - // Start FDB network thread - fdb_util::init(&tmp_dir.path().join("fdb.cluster")); - - // For receiving the terminate handle - let (terminate_tx, _terminate_rx) = - tokio::sync::mpsc::channel::(1); - - let actor_config = config::actor::Config { - resources: config::actor::Resources { - memory: 26843545600, - memory_max: 26843545600, - }, - ports: Default::default(), - env: Default::default(), - metadata: protocol::Raw::new(&protocol::ActorMetadata { - actor: protocol::ActorMetadataActor { - actor_id: Uuid::nil(), - tags: [("foo".to_string(), "bar".to_string())] - .into_iter() - .collect(), - create_ts: 0, - }, - project: protocol::ActorMetadataProject { - project_id: Uuid::nil(), - slug: "foo".into(), - }, - environment: protocol::ActorMetadataEnvironment { - env_id: Uuid::nil(), - slug: "foo".into(), - }, - datacenter: protocol::ActorMetadataDatacenter { - name_id: "local".to_string(), - display_name: "Local".to_string(), - }, - cluster: protocol::ActorMetadataCluster { - cluster_id: Uuid::nil(), - }, - build: protocol::ActorMetadataBuild { - build_id: Uuid::nil(), - }, - }) - .unwrap(), - vector_socket_addr: Default::default(), - }; - - let exit_code = run_inner( - config, - actors_path.join(actor_id.to_string()).to_path_buf(), - actor_id, - generation, - terminate_tx, - None, - actor_config, - ) - .await?; - - ensure!(exit_code == 0); - - Ok(()) - } -} diff --git a/packages/edge/infra/client/isolate-v8-runner/src/log_shipper.rs 
 b/packages/edge/infra/client/isolate-v8-runner/src/log_shipper.rs deleted file mode 100644 index 17b89f1d46..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/src/log_shipper.rs +++ /dev/null @@ -1,292 +0,0 @@ -use std::{ - io::{BufRead, Write}, - net::TcpStream, - sync::mpsc, - thread::JoinHandle, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; - -use anyhow::*; -use serde::Serialize; -use serde_json; -use uuid::Uuid; - -use crate::throttle; - -/// Maximum length of a single log line -pub const MAX_LINE_BYTES: usize = 1024; - -/// Maximum number of bytes to buffer before dropping logs -pub const MAX_BUFFER_BYTES: usize = 1024 * 1024; - -/// Maximum number of lines to print to stdout for debugging. This helps -/// identify the reasons for program crashes from the isolate's output. -const MAX_PREVIEW_LINES: usize = 128; - -#[derive(Copy, Clone, Debug)] -#[repr(u8)] -pub enum StreamType { - StdOut = 0, - StdErr = 1, -} - -pub struct ReceivedMessage { - pub stream_type: StreamType, - pub ts: u64, - pub message: String, -} - -/// Sends logs from the container to the Vector agent on the machine. -/// -/// This will run until the `msg_rx` sender is dropped, then shut down. -/// -/// If attempting to reconnect while the runner is shut down, this will exit immediately, dropping -/// all logs in the process. This is to ensure that if Vector becomes unreachable, we don't end up -/// with a lot of lingering runners that refuse to exit. -pub struct LogShipper { - pub actor_id: Uuid, - - /// Notifies of process shutdown. - pub shutdown_rx: mpsc::Receiver<()>, - - /// Receiver for messages to be shipped. This holds a buffer of messages waiting to be sent. - /// - /// If the socket closes or creates back pressure, logs will be dropped on the main thread when - /// trying to send to this channel. - pub msg_rx: mpsc::Receiver<ReceivedMessage>, - - pub vector_socket_addr: String, - - pub env_id: Uuid, -} - -impl LogShipper { - pub fn spawn(self) -> JoinHandle<()> { - std::thread::spawn(move || self.run()) - } - - fn run(self) { - // Retry loop - loop { - match self.run_inner() { - Result::Ok(()) => { - tracing::info!(actor_id=?self.actor_id, "Exiting log shipper"); - break; - } - Err(err) => { - tracing::error!(actor_id=?self.actor_id, "Log shipper error: {err:?}"); - - // Wait before attempting to reconnect. Wait for disconnect in this time - // period.
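// One detail worth noting in the retry loop above: blocking on `recv_timeout`
// against the shutdown channel doubles as the reconnect backoff, so a shutdown
// request interrupts the wait immediately instead of sleeping through it. The
// idiom in isolation (a sketch; `run_once` is a hypothetical stand-in for the
// real `run_inner`):

fn run_with_retries(shutdown_rx: std::sync::mpsc::Receiver<()>) {
    use std::sync::mpsc::RecvTimeoutError;

    loop {
        match run_once() {
            Ok(()) => break,
            Err(err) => {
                eprintln!("shipper error, will retry: {err:?}");

                // Sleep for the backoff period and watch for shutdown at the
                // same time; a signal (or a dropped sender) ends the retries.
                match shutdown_rx.recv_timeout(std::time::Duration::from_secs(15)) {
                    Ok(()) | Err(RecvTimeoutError::Disconnected) => break,
                    Err(RecvTimeoutError::Timeout) => continue,
                }
            }
        }
    }
}

// Hypothetical stand-in for the real connect-and-forward loop.
fn run_once() -> std::io::Result<()> {
    Ok(())
}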
- match self - .shutdown_rx - .recv_timeout(std::time::Duration::from_secs(15)) - { - Result::Ok(_) => { - tracing::info!(actor_id=?self.actor_id, "Log shipper received shutdown"); - break; - } - Err(mpsc::RecvTimeoutError::Disconnected) => { - tracing::error!(actor_id=?self.actor_id, "Log shipper shutdown unexpectedly disconnected"); - break; - } - Err(mpsc::RecvTimeoutError::Timeout) => { - // Not shut down, attempt reconnect - } - } - } - } - } - } - - fn run_inner(&self) -> Result<()> { - tracing::info!(actor_id=?self.actor_id, "Connecting log shipper to Vector at {}", self.vector_socket_addr); - - let mut stream = TcpStream::connect(&self.vector_socket_addr)?; - - tracing::info!(actor_id=?self.actor_id, "Log shipper connected"); - - while let Result::Ok(message) = self.msg_rx.recv() { - let vector_message = VectorMessage::Actors { - actor_id: self.actor_id.to_string(), - env_id: self.env_id, - stream_type: message.stream_type as u8, - ts: message.ts, - message: message.message.as_str(), - }; - - serde_json::to_writer(&mut stream, &vector_message)?; - stream.write_all(b"\n")?; - } - - tracing::info!(actor_id=?self.actor_id, "Log shipper msg_rx disconnected"); - - Ok(()) - } -} - -/// Vector-compatible message format -#[derive(Serialize)] -#[serde(tag = "source")] -enum VectorMessage<'a> { - #[serde(rename = "actors")] - Actors { - actor_id: String, - env_id: Uuid, - stream_type: u8, - ts: u64, - message: &'a str, - }, -} - -/// Spawn a thread to ship logs from a stream to LogShipper -pub fn ship_logs( - actor_id: Uuid, - msg_tx: Option>, - stream_type: StreamType, - stream: impl BufRead + Send + 'static, -) -> JoinHandle<()> { - std::thread::spawn(move || { - // Reduces logging spikes. This logging is in place in order to ensure that a single - // spike of logs does not exhaust the long rate limit. - // - // 64 logs/s - let mut throttle_short = throttle::Throttle::new(960, Duration::from_secs(15)); - - // Reduces logs from noisy games. Set reasonable caps on how - // much can be logged per minute. This is here to prevent games - // that log as fast as possible (i.e. positions of objects every - // tick) from exhausting the system while still allowing sane - // amounts of logging. This happens very frequently. - // - // 4 logs/s * 1024 bytes/log = 4096 bytes/lobby/s = 14.7 MB/lobby/hr = 353.8 MB/lobby/day = 10.6 GB/lobby/month - let mut throttle_long = throttle::Throttle::new(1200, Duration::from_secs(300)); - - // Throttles error logs - let mut throttle_error = throttle::Throttle::new(1, Duration::from_secs(60)); - - // How many lines have been logged as a preview, see `MAX_PREVIEW_LINES` - let mut preview_line_count = 0; - - for line in stream.lines() { - // Throttle - if let Err(err) = throttle_short.tick() { - if err.first_throttle_in_window - && send_message( - actor_id, - &msg_tx, - Some(&mut throttle_error), - stream_type, - format_rate_limit(err.time_remaining), - ) { - break; - } - continue; - } else if let Err(err) = throttle_long.tick() { - if err.first_throttle_in_window { - if send_message( - actor_id, - &msg_tx, - Some(&mut throttle_error), - stream_type, - format_rate_limit(err.time_remaining), - ) { - break; - } - } - continue; - } - - // Read message - let mut message = line.expect("failed to read line"); - - // Truncate message to MAX_LINE_BYTES. This safely truncates to ensure we don't split a - // string on a character boundary. 
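// The truncation that follows is subtler than it looks: `String::truncate`
// panics when the index is not a UTF-8 character boundary, so the code scans
// `char_indices` for the first boundary past the limit instead of cutting at
// `MAX_LINE_BYTES` directly. The same logic as a standalone helper (note the
// result can exceed the limit by up to three bytes, plus the marker suffix):

fn truncate_at_char_boundary(message: &mut String, max_bytes: usize) {
    if let Some((byte_idx, _)) = message
        .char_indices()
        .find(|&(byte_idx, _)| byte_idx > max_bytes)
    {
        // `byte_idx` is the start of a character, so this cannot panic.
        message.truncate(byte_idx);
        message.push_str(" (truncated)");
    }
}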
- if let Some((byte_idx, _)) = message - .char_indices() - .find(|&(byte_idx, _)| byte_idx > MAX_LINE_BYTES) - { - message.truncate(byte_idx); - message.push_str(" (truncated)") - } - - // Log preview of lines from the program for easy debugging from Pegboard - if preview_line_count < MAX_PREVIEW_LINES { - preview_line_count += 1; - tracing::info!( - ?actor_id, - "{stream_type:?}: {message}", - stream_type = stream_type, - message = message, - ); - - if preview_line_count == MAX_PREVIEW_LINES { - tracing::warn!( - ?actor_id, - "{stream_type:?}: ...not logging any more lines...", - stream_type = stream_type, - ); - } - } - - if send_message( - actor_id, - &msg_tx, - Some(&mut throttle_error), - stream_type, - message, - ) { - break; - } - } - - tracing::info!(?actor_id, "Ship {stream_type:?} logs thread exiting"); - }) -} - -/// Sends a message to the log shipper -/// -/// Returns true if receiver is disconnected -pub fn send_message( - actor_id: Uuid, - msg_tx: &Option>, - throttle_error: Option<&mut throttle::Throttle>, - stream_type: StreamType, - message: String, -) -> bool { - let Some(msg_tx) = msg_tx.as_ref() else { - return false; - }; - - // Timestamp is formatted in nanoseconds since that's the way it's formatted in - // ClickHouse - let ts = SystemTime::now() - .duration_since(UNIX_EPOCH) - .expect("time went backwards") - .as_nanos() as u64; - - // Attempt to send message. This will fail if the channel is full, relieving back - // pressure if Vector is not running. - match msg_tx.try_send(ReceivedMessage { - stream_type, - ts, - message, - }) { - Result::Ok(_) => {} - Err(mpsc::TrySendError::Full(_)) => { - if throttle_error.map_or(true, |x| x.tick().is_ok()) { - tracing::error!(?actor_id, "log shipper buffer full, logs are being dropped"); - } - } - Err(mpsc::TrySendError::Disconnected(_)) => { - tracing::error!(?actor_id, "log shipper unexpectedly disconnected, exiting"); - return true; - } - } - - false -} - -fn format_rate_limit(duration: Duration) -> String { - format!("...logs rate limited for {} seconds, see rivet.gg/docs/dynamic-servers/concepts/logging...", duration.as_secs()) -} diff --git a/packages/edge/infra/client/isolate-v8-runner/src/metadata.rs b/packages/edge/infra/client/isolate-v8-runner/src/metadata.rs deleted file mode 100644 index 0ab1a51efd..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/src/metadata.rs +++ /dev/null @@ -1,185 +0,0 @@ -use std::collections::HashMap; - -use anyhow::*; -use deno_core::{serde_v8::GlobalValue, v8}; -use pegboard::{protocol, types}; -use pegboard_config::isolate_runner as config; -use rivet_api::models; -use rivet_convert::ApiFrom; -use serde::Serialize; -use uuid::Uuid; - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct JsMetadata { - pub actor: JsMetadataActor, - pub network: Option, - pub project: JsMetadataProject, - pub environment: JsMetadataEnvironment, - pub region: JsMetadataRegion, - pub cluster: JsMetadataCluster, - pub build: JsMetadataBuild, -} - -impl JsMetadata { - pub fn from_actor( - actor_config: config::actor::Config, - scope: &mut v8::HandleScope<'_>, - ) -> Result { - let metadata = actor_config.metadata.deserialize()?; - - Ok(JsMetadata { - actor: JsMetadataActor { - id: metadata.actor.actor_id, - tags: metadata.actor.tags, - created_at: { - let date = v8::Local::from( - v8::Date::new(scope, metadata.actor.create_ts as f64) - .context("bad date")?, - ); - - v8::Global::new(scope, date).into() - }, - }, - network: metadata - .network - .map(|network| { - 
Ok(JsMetadataNetwork { - ports: network - .ports - .into_iter() - .map(|(name, mut port)| { - // Because the actor's original metadata was created before its ports were allocated, - // we have to modify it to set the ports here. This only applies to host ports. - if let types::Routing::Host { .. } = port.routing { - let transformed_port_name = - pegboard::util::pegboard_normalize_port_name(&name); - - port.public_port = Some( - actor_config - .ports - .get(&transformed_port_name) - .context("no proxied port found for host port")? - .target - .try_into()?, - ); - } - - Ok((name, models::ActorsPort::api_from(port).into())) - }) - .collect::>()?, - }) - }) - .transpose()?, - project: JsMetadataProject { - id: metadata.project.project_id, - slug: metadata.project.slug, - }, - environment: JsMetadataEnvironment { - id: metadata.environment.env_id, - slug: metadata.environment.slug, - }, - region: JsMetadataRegion { - id: metadata.datacenter.name_id, - name: metadata.datacenter.display_name, - }, - cluster: JsMetadataCluster { - id: metadata.cluster.cluster_id, - }, - build: JsMetadataBuild { - id: metadata.build.build_id, - }, - }) - } -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct JsMetadataActor { - pub id: Uuid, - pub tags: protocol::HashableMap, - pub created_at: GlobalValue, // v8::Date -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct JsMetadataNetwork { - pub ports: HashMap, -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct CamelCaseActorsPort { - protocol: models::ActorsPortProtocol, - internal_port: Option, - hostname: Option, - port: Option, - path: Option, - url: Option, - routing: Box, -} - -// Identity conversion from the api model to the camel case struct -impl From for CamelCaseActorsPort { - fn from(value: models::ActorsPort) -> CamelCaseActorsPort { - CamelCaseActorsPort { - protocol: value.protocol, - internal_port: value.internal_port, - hostname: value.hostname, - port: value.port, - path: value.path, - url: value.url, - routing: value.routing, - } - } -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct CamelCaseActorsPortRouting { - pub guard: Option, - pub host: Option, -} - -// Identity conversion from the api model to the camel case struct -impl From for CamelCaseActorsPortRouting { - fn from(value: models::ActorsPortRouting) -> CamelCaseActorsPortRouting { - CamelCaseActorsPortRouting { - guard: value.guard, - host: value.host, - } - } -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct JsMetadataProject { - pub id: Uuid, - pub slug: String, -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct JsMetadataEnvironment { - pub id: Uuid, - pub slug: String, -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct JsMetadataRegion { - pub id: String, - pub name: String, -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct JsMetadataCluster { - pub id: Uuid, -} - -#[derive(Serialize)] -#[serde(rename_all = "camelCase")] -pub struct JsMetadataBuild { - pub id: Uuid, -} diff --git a/packages/edge/infra/client/isolate-v8-runner/src/throttle.rs b/packages/edge/infra/client/isolate-v8-runner/src/throttle.rs deleted file mode 100644 index ace29ce3ef..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/src/throttle.rs +++ /dev/null @@ -1,55 +0,0 @@ -use std::time::{Duration, Instant}; - -/// Utility for rate limiting logs. 
-/// -/// Logs are rate limited within this code instead of Vector because the earlier we drop logs, the -/// more resources it saves. -pub struct Throttle { - threshold: usize, - window: Duration, - - window_start: Instant, - count: usize, -} - -impl Throttle { - pub fn new(threshold: usize, window: Duration) -> Self { - Throttle { - threshold, - window, - window_start: Instant::now(), - count: 0, - } - } - - pub fn tick(&mut self) -> Result<(), TickError> { - // Reset window - if self.window_start.elapsed() > self.window { - self.window_start = Instant::now(); - self.count = 0; - } - - // Count - // - // Do this before error in order to determine if first throttle - self.count += 1; - - // Throttle - if self.count > self.threshold { - return Err(TickError { - time_remaining: self.window - self.window_start.elapsed(), - first_throttle_in_window: self.count == self.threshold + 1, - }); - } - - Ok(()) - } -} - -pub struct TickError { - /// How much time is remaining in this window - pub time_remaining: Duration, - - /// The first throttle in this time window - pub first_throttle_in_window: bool, -} diff --git a/packages/edge/infra/client/isolate-v8-runner/src/utils.rs b/packages/edge/infra/client/isolate-v8-runner/src/utils.rs deleted file mode 100644 index 9ca179cd04..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/src/utils.rs +++ /dev/null @@ -1,162 +0,0 @@ -use std::{ops::Deref, path::Path, result::Result::Ok, sync::Arc}; - -use ::tokio::fs; -use anyhow::*; -use foundationdb as fdb; -use pegboard_config::isolate_runner::Config; -use service_discovery::ServiceDiscovery; - -// TODO: Copied from rivet_pools -#[derive(Clone)] -pub struct FdbPool { - db: Arc, - _sd: Option>, - // Prevent dropping temp file - _connection_file: Arc, -} - -impl Deref for FdbPool { - type Target = Arc; - - fn deref(&self) -> &Self::Target { - &self.db - } -} - -#[tracing::instrument(skip(config))] -pub async fn setup_fdb_pool(config: &Config) -> Result { - let temp_file = tempfile::NamedTempFile::new()?; - let temp_path = temp_file.path().to_path_buf(); - - let fdb_config = config.foundationdb.clone(); - - let sd = match &fdb_config.addresses { - pegboard_config::Addresses::Dynamic { fetch_endpoint } => { - let sd = ServiceDiscovery::new(fetch_endpoint.clone()); - - // Initial fetch - let servers = sd.fetch().await.context("failed to fetch services")?; - let joined = servers - .into_iter() - .filter_map(|server| server.lan_ip) - .map(|lan_ip| format!("{lan_ip}:4500")) - .collect::>() - .join(","); - write_connection_file(&fdb_config, &temp_path, &joined).await?; - - sd.start(move |servers| { - let temp_path = temp_path.clone(); - let fdb_config = fdb_config.clone(); - async move { - let joined = servers - .into_iter() - .filter_map(|server| server.lan_ip) - .map(|lan_ip| format!("{lan_ip}:4500")) - .collect::>() - .join(","); - - write_connection_file(&fdb_config, &temp_path, &joined).await?; - - anyhow::Ok(()) - } - }); - - Some(sd) - } - pegboard_config::Addresses::Static(addresses) => { - let joined = addresses.join(","); - write_connection_file(&fdb_config, &temp_path, &joined).await?; - - None - } - }; - - // Start network - fdb_util::init(temp_file.path()); - - let fdb_handle = fdb_util::handle(&temp_file.path())?; - - tracing::debug!(config_file_path=%temp_file.path().display(), "fdb started"); - - Ok(FdbPool { - db: Arc::new(fdb_handle), - _sd: sd, - _connection_file: Arc::new(temp_file), - }) -} - -async fn write_connection_file( - fdb_config: &pegboard_config::FoundationDb, - temp_path: 
&Path, - joined: &str, -) -> Result<(), std::io::Error> { - let connection = format!( - "{cluster_description}:{cluster_id}@{joined}", - cluster_description = fdb_config.cluster_description, - cluster_id = fdb_config.cluster_id, - ); - - fs::write(temp_path, connection.as_bytes()).await?; - - Ok(()) -} - -pub mod tokio { - use anyhow::*; - use deno_core::unsync::MaskFutureAsSend; - - // Copied from deno-runtime tokio_util.rs - fn create_basic_runtime() -> Result<::tokio::runtime::Runtime> { - let event_interval = 61; - let global_queue_interval = 31; - let max_io_events_per_tick = 1024; - - ::tokio::runtime::Builder::new_current_thread() - .enable_io() - .enable_time() - .event_interval(event_interval) - .global_queue_interval(global_queue_interval) - .max_io_events_per_tick(max_io_events_per_tick) - // This limits the number of threads for blocking operations (like for - // synchronous fs ops) or CPU bound tasks like when we run dprint in - // parallel for deno fmt. - // The default value is 512, which is an unhelpfully large thread pool. We - // don't ever want to have more than a couple dozen threads. - .max_blocking_threads(32) - .build() - .map_err(Into::into) - } - - // Copied from deno-runtime tokio_util.rs - #[inline(always)] - pub fn create_and_run_current_thread<F, R>(future: F) -> Result<R> - where - F: std::future::Future<Output = R> + 'static, - R: Send + 'static, - { - let rt = create_basic_runtime()?; - - // Since this is the main future, we want to box it in debug mode because it tends to be fairly - // large and the compiler won't optimize repeated copies. We also make this runtime factory - // function #[inline(always)] to avoid holding the unboxed, unused future on the stack. - - #[cfg(debug_assertions)] - // SAFETY: this is guaranteed to be running on a current-thread executor - let future = Box::pin(unsafe { MaskFutureAsSend::new(future) }); - - #[cfg(not(debug_assertions))] - // SAFETY: this is guaranteed to be running on a current-thread executor - let future = unsafe { MaskFutureAsSend::new(future) }; - - let join_handle = rt.spawn(future); - - let r = rt.block_on(join_handle)?.into_inner(); - // Forcefully shutdown the runtime - we're done executing JS code at this - // point, but there might be outstanding blocking tasks that were created and - // later "unrefed". They won't terminate on their own, so we're forcing - // termination of Tokio runtime at this point.
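// Stripped of the debug-mode boxing and the `MaskFutureAsSend` wrapper (which
// exists so a `!Send` future can be spawned onto the current-thread runtime),
// the helper reduces to roughly the following simplified sketch, which blocks
// on the future directly instead of spawning it:

fn run_current_thread<F: std::future::Future>(future: F) -> anyhow::Result<F::Output> {
    let rt = tokio::runtime::Builder::new_current_thread()
        .enable_io()
        .enable_time()
        .build()?;

    let out = rt.block_on(future);

    // Lingering blocking tasks may never finish on their own, so drop the
    // runtime in the background rather than joining them.
    rt.shutdown_background();

    Ok(out)
}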
- rt.shutdown_background(); - - Ok(r) - } -} diff --git a/packages/edge/infra/client/isolate-v8-runner/tests/index.js b/packages/edge/infra/client/isolate-v8-runner/tests/index.js deleted file mode 100644 index b0726d855a..0000000000 --- a/packages/edge/infra/client/isolate-v8-runner/tests/index.js +++ /dev/null @@ -1,24 +0,0 @@ -// Used by the test in isolate.rs - -export default { - async start(ctx) { - console.log(ctx); - - await ctx.kv.putBatch( - new Map([ - [["foob", "b"], 12], - [["foob", "a"], null], - [["foob", "c"], true], - ]), - ); - - const res = await ctx.kv.list({ prefix: ["foob"] }); - - console.log(res.array(), res.raw(), res.entries()); - console.log(res.get(["foob", "b"])); - - Deno.exit(2); - - throw new Error("bingus"); - }, -}; diff --git a/packages/edge/infra/client/manager/Cargo.toml b/packages/edge/infra/client/manager/Cargo.toml index e23ac05b56..c485ac240a 100644 --- a/packages/edge/infra/client/manager/Cargo.toml +++ b/packages/edge/infra/client/manager/Cargo.toml @@ -34,6 +34,7 @@ serde = { version = "1.0.195", features = ["derive"] } serde_json = "1.0.111" serde_yaml = "0.9.34" service-discovery.workspace = true +strum = { version = "0.26", features = ["derive"] } sysinfo = "0.31.4" tempfile = "3.2" thiserror = "1.0" diff --git a/packages/edge/infra/client/manager/src/actor/mod.rs b/packages/edge/infra/client/manager/src/actor/mod.rs index 8dfae8c454..2e08046510 100644 --- a/packages/edge/infra/client/manager/src/actor/mod.rs +++ b/packages/edge/infra/client/manager/src/actor/mod.rs @@ -1,38 +1,22 @@ use std::{ result::Result::{Err, Ok}, sync::Arc, - time::{Duration, Instant}, }; use anyhow::*; use indoc::indoc; -use nix::sys::signal::Signal; +use nix::{sys::signal::Signal, unistd::Pid}; use pegboard::protocol; use pegboard_config::runner_protocol; -use sqlx::Acquire; -use tokio::{fs, sync::Mutex}; use uuid::Uuid; use crate::{ctx::Ctx, runner, utils}; -mod oci_config; -mod partial_oci_config; -mod seccomp; -mod setup; - -/// How often to check for a PID when one is not present and a stop command was received. -const STOP_PID_INTERVAL: Duration = std::time::Duration::from_millis(250); -/// How many times to check for a PID when a stop command was received. 
-const STOP_PID_RETRIES: usize = 1024; - pub struct Actor { actor_id: Uuid, generation: u32, config: protocol::ActorConfig, - metadata: protocol::ActorMetadata, - - runner: Mutex>, - exited: Mutex, + runner: Arc, } impl Actor { @@ -40,34 +24,13 @@ impl Actor { actor_id: Uuid, generation: u32, config: protocol::ActorConfig, - metadata: protocol::ActorMetadata, + runner: Arc, ) -> Arc { Arc::new(Actor { actor_id, generation, config, - metadata, - - runner: Mutex::new(None), - exited: Mutex::new(false), - }) - } - - pub fn with_runner( - actor_id: Uuid, - generation: u32, - config: protocol::ActorConfig, - metadata: protocol::ActorMetadata, - runner: runner::Handle, - ) -> Arc { - Arc::new(Actor { - actor_id, - generation, - config, - metadata, - - runner: Mutex::new(Some(runner)), - exited: Mutex::new(false), + runner, }) } @@ -113,18 +76,15 @@ impl Actor { let self2 = self.clone(); let ctx2 = ctx.clone(); tokio::spawn(async move { - match self2.setup(&ctx2).await { - Ok(proxied_ports) => match self2.run(&ctx2, proxied_ports).await { - Ok(_) => { - if let Err(err) = self2.observe(&ctx2).await { - tracing::error!(actor_id=?self2.actor_id, ?err, "observe failed"); - } - } - Err(err) => { - tracing::error!(actor_id=?self2.actor_id, ?err, "run failed") + match self2.run(&ctx2).await { + Ok(observers) => { + if let Err(err) = self2.observe(&ctx2, observers).await { + tracing::error!(actor_id=?self2.actor_id, ?err, "observe failed"); } - }, - Err(err) => tracing::error!(actor_id=?self2.actor_id, ?err, "setup failed"), + } + Err(err) => { + tracing::error!(actor_id=?self2.actor_id, ?err, "run failed") + } } // Cleanup afterwards @@ -136,227 +96,107 @@ impl Actor { Ok(()) } - async fn setup( - self: &Arc, - ctx: &Arc, - ) -> Result> { - let setup_timer = Instant::now(); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "setting up actor"); - - let actor_path = ctx.actor_path(self.actor_id, self.generation); + async fn run(self: &Arc, ctx: &Arc) -> Result> { + tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "running"); - // Create actor working dir - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "creating actor working directory"); - fs::create_dir(&actor_path) - .await - .context("failed to create actor dir")?; - - // Determine ahead of time if we need to set up CNI network - let needs_cni_network = - matches!( - self.config.image.kind, - protocol::ImageKind::DockerImage | protocol::ImageKind::OciBundle - ) && matches!(self.config.network_mode, protocol::NetworkMode::Bridge); - - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "starting parallel setup tasks"); - let parallel_timer = Instant::now(); - - let (_, ports) = tokio::try_join!( - async { - self.download_image(&ctx).await?; - self.make_fs(&ctx).await - }, - async { - let ports = self.bind_ports(ctx).await?; - if needs_cni_network { - self.setup_cni_network(&ctx, &ports).await?; - } + // NOTE: Create actor observer before sending the start actor message to prevent a race + // condition + let actor_observer = match self.runner.config().image.allocation_type { + protocol::ImageAllocationType::Single => None, + protocol::ImageAllocationType::Multi => Some( + self.runner + .new_actor_observer(self.actor_id, self.generation), + ), + }; - Ok(ports) - } - )?; - - let parallel_duration = parallel_timer.elapsed().as_secs_f64(); - crate::metrics::SETUP_PARALLEL_TASKS_DURATION.observe(parallel_duration); - tracing::info!( - actor_id=?self.actor_id, - 
generation=?self.generation, - duration_seconds=parallel_duration, - "parallel setup tasks completed" - ); - - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "setting up runtime environment"); - match self.config.image.kind { - protocol::ImageKind::DockerImage | protocol::ImageKind::OciBundle => { - self.setup_oci_bundle(&ctx, &ports).await?; - } - protocol::ImageKind::JavaScript => self.setup_isolate(&ctx, &ports).await?, - } + match self + .config + .runner + .as_ref() + .context("should have runner config")? + { + protocol::ActorRunner::New { .. } => { + // Because the runner is not already started we can get the ports here instead of reading from + // sqlite + let ports = self.runner.start(ctx).await?; - let duration = setup_timer.elapsed().as_secs_f64(); - crate::metrics::SETUP_TOTAL_DURATION.observe(duration); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, duration_seconds=duration, "actor setup completed"); + let pid = self.runner.pid().await?; - Ok(ports) - } + tracing::info!(actor_id=?self.actor_id, generation=?self.generation, ?pid, "pid received"); - async fn run( - self: &Arc, - ctx: &Arc, - ports: protocol::HashableMap, - ) -> Result<()> { - tracing::info!(actor_id=?self.actor_id, env_id=?self.metadata.environment.env_id, generation=?self.generation, "spawning"); - - let mut runner_env = vec![ - ( - "ROOT_USER_ENABLED", - if self.config.root_user_enabled { - "1" - } else { - "0" - } - .to_string(), - ), - ("ACTOR_ID", self.actor_id.to_string()), - ( - "ENVIRONMENT_ID", - self.metadata.environment.env_id.to_string(), - ), - ]; - if let Some(vector) = &ctx.config().vector { - runner_env.push(("VECTOR_SOCKET_ADDR", vector.address.to_string())); - } - - let runner = match self.config.image.kind { - // Spawn runner which spawns the container - protocol::ImageKind::DockerImage | protocol::ImageKind::OciBundle => { - runner::Handle::spawn_orphaned( - runner::Comms::Basic, - &ctx.config().runner.container_runner_binary_path(), - &pegboard_config::utils::format_container_id( - &self.actor_id.to_string(), - self.generation, - ), - ctx.actor_path(self.actor_id, self.generation), - &runner_env, - )? + match self.runner.config().image.allocation_type { + protocol::ImageAllocationType::Single => { + self.set_running(ctx, pid, ports).await? + } + protocol::ImageAllocationType::Multi => { + self.runner + .send(&runner_protocol::ToRunner::StartActor { + actor_id: self.actor_id, + generation: self.generation, + env: self.config.env.clone(), + metadata: self.config.metadata.clone(), + }) + .await?; + } + }; } - // Shared runner - protocol::ImageKind::JavaScript => { - let runner = ctx.get_or_spawn_isolate_runner().await?; - - runner - .send(&runner_protocol::ToRunner::Start { - actor_id: self.actor_id, - generation: self.generation, - }) - .await?; - - runner + protocol::ActorRunner::Existing { .. 
} => { + match self.runner.config().image.allocation_type { + protocol::ImageAllocationType::Single => { + unimplemented!( + "allocating new actor to an existing `Single` allocation_type runner" + ); + } + protocol::ImageAllocationType::Multi => { + self.runner + .send(&runner_protocol::ToRunner::StartActor { + actor_id: self.actor_id, + generation: self.generation, + env: self.config.env.clone(), + metadata: self.config.metadata.clone(), + }) + .await?; + } + }; } - }; - let pid = runner.pid().clone(); - - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, ?pid, "pid received"); - - // Store runner - { - *self.runner.lock().await = Some(runner); } - // Update DB - utils::sql::query(|| async { - sqlx::query(indoc!( - " - UPDATE actors - SET - running_ts = ?3, - pid = ?4 - WHERE - actor_id = ?1 AND - generation = ?2 - ", - )) - .bind(self.actor_id) - .bind(self.generation as i64) - .bind(utils::now()) - .bind(pid.as_raw()) - .execute(&mut *ctx.sql().await?) - .await - }) - .await?; - - ctx.event(protocol::Event::ActorStateUpdate { - actor_id: self.actor_id, - generation: self.generation, - state: protocol::ActorState::Running { - pid: pid.as_raw().try_into()?, - ports, - }, - }) - .await?; - - Ok(()) + Ok(actor_observer) } // Watch actor for updates - pub(crate) async fn observe(&self, ctx: &Arc) -> Result<()> { + pub(crate) async fn observe( + &self, + ctx: &Arc, + actor_observer: Option, + ) -> Result<()> { tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "observing"); - let Some(runner) = ({ (*self.runner.lock().await).clone() }) else { - bail!("actor does not have a runner to observe yet"); - }; - - let exit_code = match self.config.image.kind { - protocol::ImageKind::DockerImage | protocol::ImageKind::OciBundle => { - runner.observe().await? - } - // With isolates we have to check if the shared isolate runner exited and if the isolate itself - // exited - protocol::ImageKind::JavaScript => { - let actor_path = ctx.actor_path(self.actor_id, self.generation); - let exit_code_path = actor_path.join("exit-code"); - + let exit_code = if let Some(mut actor_observer) = actor_observer { + loop { tokio::select! { - res = runner.observe() => res?, - res = utils::wait_for_write(&exit_code_path) => { - res?; - - let exit_code = match fs::read_to_string(&exit_code_path).await { - Ok(contents) => if contents.trim().is_empty() { - // File exists but is empty. This is explicit - None - } else { - match contents.trim().parse::() { - Ok(x) => Some(x), - Err(err) => { - tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, - ?err, - "failed to parse exit code file", - ); - - None - } - } - }, - Err(err) => { - tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, - ?err, - "failed to read exit code file", - ); - - None - } - }; - - exit_code + // We have to check if the shared runner exited or if the actor exited + res = self.runner.observe(ctx, true) => break res?, + res = actor_observer.next() => match res { + Some(runner_protocol::ActorState::Running) => { + tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "actor running"); + + let (pid, ports) = tokio::try_join!( + self.runner.pid(), + self.runner.ports(ctx), + )?; + + self.set_running(ctx, pid, ports).await?; + }, + Some(runner_protocol::ActorState::Exited { + exit_code, + }) => break exit_code, + None => break None, }, } } + } else { + self.runner.observe(ctx, true).await? 
}; self.set_exit_code(ctx, exit_code).await?; @@ -391,60 +231,13 @@ impl Actor { signal: Signal, persist_storage: bool, ) -> Result<()> { - let mut i = 0; - - // Signal command might be sent before the actor has a runner. This loop waits for the runner to start - let runner_guard = loop { - { - let runner_guard = self.runner.lock().await; - if runner_guard.is_some() { - break Some(runner_guard); - } - } - - if *self.exited.lock().await { - tracing::warn!( - actor_id=?self.actor_id, - generation=?self.generation, - "actor exited before PID was set, ignoring signal", - ); - - break None; - } - - // Progress log - if i % 10 == 0 { - tracing::warn!( - actor_id=?self.actor_id, - generation=?self.generation, - "waiting for PID to signal actor", - ); - } - - if i > STOP_PID_RETRIES { - tracing::error!( - actor_id=?self.actor_id, - "timed out waiting for actor to get PID, considering actor stopped", - ); - - break None; - } - - i += 1; - - tokio::time::sleep(STOP_PID_INTERVAL).await; - }; - - let has_runner = runner_guard.is_some(); - - // Kill if runner exists - if let Some(runner) = runner_guard { - let runner = &*runner.as_ref().expect("must exist"); + let has_pid = self.runner.pid().await.is_ok(); + if has_pid { // Send message - if runner.has_socket() { - runner - .send(&runner_protocol::ToRunner::Signal { + if self.runner.has_socket() { + self.runner + .send(&runner_protocol::ToRunner::SignalActor { actor_id: self.actor_id, generation: self.generation, signal: signal as i32, @@ -454,12 +247,12 @@ impl Actor { } // Send signal else { - runner.signal(signal)?; + self.runner.signal(signal).await?; } } // Update stop_ts - if matches!(signal, Signal::SIGTERM | Signal::SIGKILL) || !has_runner { + if matches!(signal, Signal::SIGTERM | Signal::SIGKILL) || !has_pid { let stop_ts_set = utils::sql::query(|| async { let mut conn = ctx.sql().await?; let mut tx = conn.begin().await?; @@ -515,22 +308,18 @@ impl Actor { } #[tracing::instrument(skip_all)] - pub async fn set_exit_code(&self, ctx: &Ctx, exit_code: Option) -> Result<()> { - let mut guard = self.exited.lock().await; - - // Already exited - if *guard { - return Ok(()); - } - + async fn set_running( + &self, + ctx: &Ctx, + pid: Pid, + ports: protocol::HashableMap, + ) -> Result<()> { // Update DB utils::sql::query(|| async { sqlx::query(indoc!( " UPDATE actors - SET - exit_ts = ?3, - exit_code = ?4 + SET running_ts = ?3 WHERE actor_id = ?1 AND generation = ?2 @@ -539,31 +328,55 @@ impl Actor { .bind(self.actor_id) .bind(self.generation as i64) .bind(utils::now()) - .bind(exit_code) .execute(&mut *ctx.sql().await?) .await }) .await?; - // Unbind ports - utils::sql::query(|| async { - sqlx::query(indoc!( + ctx.event(protocol::Event::ActorStateUpdate { + actor_id: self.actor_id, + generation: self.generation, + state: protocol::ActorState::Running { + pid: pid.as_raw().try_into()?, + ports, + }, + }) + .await?; + + Ok(()) + } + + #[tracing::instrument(skip_all)] + pub async fn set_exit_code(&self, ctx: &Ctx, exit_code: Option) -> Result<()> { + // Update DB + let row = utils::sql::query(|| async { + sqlx::query_as::<_, (bool,)>(indoc!( " - UPDATE actor_ports - SET delete_ts = ?3 + UPDATE actors + SET + exit_ts = ?3, + exit_code = ?4 WHERE actor_id = ?1 AND - generation = ?2 + generation = ?2 AND + exit_ts IS NULL + RETURNING 1 ", )) .bind(self.actor_id) .bind(self.generation as i64) .bind(utils::now()) - .execute(&mut *ctx.sql().await?) + .bind(exit_code) + .fetch_optional(&mut *ctx.sql().await?) 
.await }) .await?; + // Already exited + if row.is_none() { + return Ok(()); + } + ctx.event(protocol::Event::ActorStateUpdate { actor_id: self.actor_id, generation: self.generation, @@ -571,21 +384,16 @@ impl Actor { }) .await?; - *guard = true; - Ok(()) } #[tracing::instrument(skip_all)] pub async fn cleanup(&self, ctx: &Ctx) -> Result<()> { - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "cleaning up"); + tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "cleaning up actor"); // Set exit code if it hasn't already been set self.set_exit_code(ctx, None).await?; - // Cleanup setup. Should only be called after the exit code is set successfully for consistent state - self.cleanup_setup(ctx).await; - // It is important that we remove from the actors list last so that we prevent duplicates { let mut actors = ctx.actors.write().await; diff --git a/packages/edge/infra/client/manager/src/ctx.rs b/packages/edge/infra/client/manager/src/ctx.rs index 54d44d5211..c1790a3f16 100644 --- a/packages/edge/infra/client/manager/src/ctx.rs +++ b/packages/edge/infra/client/manager/src/ctx.rs @@ -1,6 +1,5 @@ use std::{ collections::HashMap, - net::SocketAddr, path::PathBuf, result::Result::{Err, Ok}, sync::Arc, @@ -13,11 +12,13 @@ use futures_util::{ SinkExt, StreamExt, }; use indoc::indoc; -use nix::{sys::signal::Signal, unistd::Pid}; -use pegboard::{protocol, system_info::SystemInfo}; -use pegboard_config::{ - isolate_runner::Config as IsolateRunnerConfig, runner_protocol, Client, Config, +use nix::{ + errno::Errno, + sys::signal::{kill, Signal}, + unistd::Pid, }; +use pegboard::{protocol, system_info::SystemInfo}; +use pegboard_config::{runner_protocol, Client, Config}; use sqlx::{pool::PoolConnection, Acquire, Sqlite, SqlitePool}; use tokio::{ fs, @@ -38,12 +39,19 @@ use crate::{ actor::Actor, event_sender::EventSender, image_download_handler::ImageDownloadHandler, - metrics, runner, + metrics, + runner::{self, Runner}, utils::{self, sql::SqlitePoolExt}, }; const PING_INTERVAL: Duration = Duration::from_secs(1); const ACK_INTERVAL: Duration = Duration::from_secs(60 * 5); +/// How long before killing a runner that hasn't sent an init packet. +const RUNNER_INIT_TIMEOUT: Duration = Duration::from_secs(5); +/// How often to check for the actor's runner to start. +const GET_RUNNER_INTERVAL: Duration = std::time::Duration::from_millis(250); +/// How many times to check for the actor's runner to start. 
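The reworked `set_exit_code` above folds the old `exited: Mutex<bool>` guard into the database itself: `WHERE ... exit_ts IS NULL` makes the first caller win, and `RETURNING` tells the caller whether this invocation performed the transition. A self-contained sketch of the same query shape (hypothetical helper name; assumes sqlx with the sqlite and uuid features enabled):

```rust
use sqlx::SqlitePool;
use uuid::Uuid;

// Set the exit code at most once. Returns true if this call recorded the
// exit, false if a previous call already did.
async fn try_set_exit_code(
    pool: &SqlitePool,
    actor_id: Uuid,
    generation: i64,
    now_ms: i64,
    exit_code: Option<i32>,
) -> sqlx::Result<bool> {
    let row = sqlx::query_as::<_, (i64,)>(
        "UPDATE actors
         SET exit_ts = ?3, exit_code = ?4
         WHERE actor_id = ?1 AND generation = ?2 AND exit_ts IS NULL
         RETURNING 1",
    )
    .bind(actor_id)
    .bind(generation)
    .bind(now_ms)
    .bind(exit_code)
    .fetch_optional(pool)
    .await?;

    // `None` means another caller already recorded an exit, so the caller can
    // skip emitting a duplicate state-update event.
    Ok(row.is_some())
}
```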
+const GET_RUNNER_RETRIES: usize = 32; #[derive(thiserror::Error, Debug)] pub enum RuntimeError { @@ -67,8 +75,14 @@ struct ActorRow { actor_id: Uuid, generation: i64, config: Vec, +} + +#[derive(sqlx::FromRow)] +struct RunnerRow { + runner_id: Uuid, + comms: i64, + config: Vec, pid: Option, - stop_ts: Option, } pub struct Ctx { @@ -82,8 +96,8 @@ pub struct Ctx { event_sender: EventSender, pub(crate) image_download_handler: ImageDownloadHandler, + pub(crate) runners: RwLock>>, pub(crate) actors: RwLock>>, - isolate_runner: RwLock>, } impl Ctx { @@ -102,8 +116,8 @@ impl Ctx { event_sender: EventSender::new(), image_download_handler: ImageDownloadHandler::new(), + runners: RwLock::new(HashMap::new()), actors: RwLock::new(HashMap::new()), - isolate_runner: RwLock::new(None), }) } @@ -178,50 +192,6 @@ impl Ctx { self: &Arc, mut rx: SplitStream>>, ) -> Result<()> { - // Rebuild isolate runner from db before starting runner socket - self.rebuild_isolate_runner().await?; - - // Start runner socket - let self2 = self.clone(); - let runner_socket: tokio::task::JoinHandle> = tokio::spawn(async move { - tracing::info!(port=%self2.config().runner.port(), "listening for runner sockets"); - - let listener = TcpListener::bind(("0.0.0.0", self2.config().runner.port())) - .await - .map_err(RuntimeError::RunnerSocketListenFailed)?; - - loop { - match listener.accept().await { - Ok((stream, _)) => { - let mut ws_stream = tokio_tungstenite::accept_async(stream).await?; - - tracing::info!("received new socket"); - - if let Some(runner) = &*self2.isolate_runner.read().await { - runner.attach_socket(ws_stream).await?; - } else { - tracing::error!("killing unknown runner"); - - metrics::UNKNOWN_ISOLATE_RUNNER.with_label_values(&[]).inc(); - - ws_stream - .send(Message::Binary(serde_json::to_vec( - &runner_protocol::ToRunner::Terminate, - )?)) - .await?; - - let close_frame = CloseFrame { - code: CloseCode::Error, - reason: "unknown runner".into(), - }; - ws_stream.send(Message::Close(Some(close_frame))).await?; - } - } - Err(err) => tracing::error!(?err, "failed to connect websocket"), - } - } - }); - // Send init packet { let (last_command_idx, last_workflow_id) = utils::sql::query(|| async { @@ -246,6 +216,44 @@ impl Ctx { self.receive_init(&mut rx).await?; + // Start runner socket and attaches incoming connections to their corresponding runner + let self2 = self.clone(); + let runner_socket: tokio::task::JoinHandle> = tokio::spawn(async move { + tracing::info!(port=%self2.config().runner.port(), "listening for runner sockets"); + + let listener = + TcpListener::bind((self2.config().runner.ip(), self2.config().runner.port())) + .await + .map_err(RuntimeError::RunnerSocketListenFailed)?; + + loop { + match listener.accept().await { + Ok((stream, _)) => { + let mut ws_stream = Some(tokio_tungstenite::accept_async(stream).await?); + + tracing::info!("received new socket"); + + if let Err(err) = self2.receive_runner_init_message(&mut ws_stream).await { + tracing::error!( + ?err, + "failed to receive init message from runner socket" + ); + } + + // Close stream + if let Some(mut ws_stream) = ws_stream { + let close_frame = CloseFrame { + code: CloseCode::Error, + reason: "init failed".into(), + }; + ws_stream.send(Message::Close(Some(close_frame))).await?; + } + } + Err(err) => tracing::error!(?err, "failed to connect websocket"), + } + } + }); + // Start ping thread after init packet is received because ping denotes this client as "ready" let self2 = self.clone(); let ping_thread: tokio::task::JoinHandle> = 
tokio::spawn(async move { @@ -399,10 +407,7 @@ impl Ctx { tracing::debug!(?packet, "received packet"); match packet { - protocol::ToClient::Init { .. } => { - metrics::SECOND_INIT.with_label_values(&[]).inc(); - bail!("unexpected second init packet"); - } + protocol::ToClient::Init { .. } => bail!("unexpected second init packet"), protocol::ToClient::Commands(commands) => { for command in commands { self.process_command(command).await?; @@ -432,15 +437,39 @@ impl Ctx { "actor with this actor id + generation already exists, ignoring start command", ); } else { - let actor = Actor::new(actor_id, generation, *config, metadata); + if let Some(runner) = self + .get_or_create_runner( + config + .runner + .as_ref() + .context("runner config should exist")?, + ) + .await? + { + let actor = Actor::new(actor_id, generation, *config, runner); - // Insert actor - actors.insert((actor_id, generation), actor); + // Insert actor + actors.insert((actor_id, generation), actor); - let actor = actors.get(&(actor_id, generation)).context("unreachable")?; + let actor = actors.get(&(actor_id, generation)).context("unreachable")?; - // Spawn actor - actor.start(&self).await?; + // Spawn actor + actor.start(&self).await?; + } else { + tracing::error!( + ?actor_id, + ?generation, + runner=?config.runner, + "timed out waiting for actor's runner to be inserted, considering actor lost", + ); + + self.event(protocol::Event::ActorStateUpdate { + actor_id: actor_id, + generation: generation, + state: protocol::ActorState::Lost, + }) + .await?; + } } } protocol::Command::SignalActor { @@ -461,6 +490,16 @@ impl Ctx { ); } } + protocol::Command::SignalRunner { runner_id, signal } => { + if let Some(runner) = self.runners.read().await.get(&runner_id) { + runner.signal(signal.try_into()?).await?; + } else { + tracing::warn!( + ?runner_id, + "received stop runner command for runner that doesn't exist (likely already stopped)" + ); + } + } } // Ack command @@ -498,111 +537,46 @@ impl Ctx { Ok(()) } -} - -// MARK: Isolate runner -impl Ctx { - pub(crate) async fn get_or_spawn_isolate_runner(self: &Arc) -> Result { - let mut guard = self.isolate_runner.write().await; - if let Some(runner) = &*guard { - Ok(runner.clone()) - } else { - tracing::info!("spawning new isolate runner"); + /// Returns None if the runner could not be found in the runners map on time. 
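The `Existing` arm of `get_or_create_runner` (next hunk) has to tolerate command reordering: an actor's start command can arrive before the command that created its runner has finished inserting into the map, so it polls for a bounded time instead of failing outright. Roughly, as a hypothetical free-function form of that loop:

```rust
use std::{collections::HashMap, sync::Arc, time::Duration};

use tokio::sync::RwLock;
use uuid::Uuid;

const GET_RUNNER_INTERVAL: Duration = Duration::from_millis(250);
const GET_RUNNER_RETRIES: usize = 32;

// Stand-in for the runner handle type.
struct Runner;

// Poll the shared map for a bounded time; `None` means the runner never
// showed up and the actor should be reported as lost.
async fn wait_for_runner(
    runners: &RwLock<HashMap<Uuid, Arc<Runner>>>,
    runner_id: Uuid,
) -> Option<Arc<Runner>> {
    for _ in 0..=GET_RUNNER_RETRIES {
        {
            let guard = runners.read().await;
            if let Some(runner) = guard.get(&runner_id) {
                return Some(runner.clone());
            }
        } // Drop the read lock before sleeping.
        tokio::time::sleep(GET_RUNNER_INTERVAL).await;
    }
    None
}
```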
+ async fn get_or_create_runner( + &self, + runner: &protocol::ActorRunner, + ) -> Result>> { + match runner { + protocol::ActorRunner::New { runner_id, config } => { + let mut runners = self.runners.write().await; - let working_path = self.isolate_runner_path(); + let comms = match &config.image.allocation_type { + protocol::ImageAllocationType::Single => runner::Comms::Basic, + protocol::ImageAllocationType::Multi => runner::Comms::socket(), + }; - let config = IsolateRunnerConfig { - actors_path: self.actors_path(), - manager_ws_addr: SocketAddr::from(([127, 0, 0, 1], self.config().runner.port())), - foundationdb: self.config.client.foundationdb.clone(), - }; + let runner = Arc::new(Runner::new(*runner_id, comms, config.clone())); + runners.insert(*runner_id, runner.clone()); - // Delete existing exit code - if let Err(err) = fs::remove_file(working_path.join("exit-code")).await { - if err.kind() != std::io::ErrorKind::NotFound { - return Err(err.into()); - } + Ok(Some(runner)) } + protocol::ActorRunner::Existing { runner_id } => { + let mut i = 0; + + loop { + { + let runners_guard = self.runners.read().await; + if let Some(runner) = runners_guard.get(runner_id) { + break Ok(Some(runner.clone())); + } + } - // Write isolate runner config - fs::write( - working_path.join("config.json"), - serde_json::to_vec(&config)?, - ) - .await?; - - let runner = runner::Handle::spawn_orphaned( - runner::Comms::socket(), - &self.config().runner.isolate_runner_binary_path(), - "", - working_path, - &[], - )?; - let pid = runner.pid(); - - self.observe_isolate_runner(&runner); - - // Save runner pid - utils::sql::query(|| async { - sqlx::query(indoc!( - " - UPDATE state - SET isolate_runner_pid = ?1 - ", - )) - .bind(pid.as_raw()) - .execute(&mut *self.sql().await?) - .await - }) - .await?; - - *guard = Some(runner.clone()); - - Ok(runner) - } - } - - fn observe_isolate_runner(self: &Arc, runner: &runner::Handle) { - tracing::info!(pid=?runner.pid(), "observing isolate runner"); + if i > GET_RUNNER_RETRIES { + break Ok(None); + } + i += 1; - // Observe runner - let self2 = self.clone(); - let runner2 = runner.clone(); - tokio::spawn(async move { - let exit_code = match runner2.observe().await { - Ok(exit_code) => exit_code, - Err(err) => { - // TODO: This should hard error the manager - tracing::error!(%err, "failed to observe isolate runner"); - return; + tokio::time::sleep(GET_RUNNER_INTERVAL).await; } - }; - - tracing::error!(pid=?runner2.pid(), ?exit_code, "isolate runner exited"); - - // Update in-memory state - let mut guard = self2.isolate_runner.write().await; - *guard = None; - - // Update db state - let res = utils::sql::query(|| async { - sqlx::query(indoc!( - " - UPDATE state - SET isolate_runner_pid = NULL - ", - )) - .execute(&mut *self2.sql().await?) - .await - }) - .await; - - if let Err(err) = res { - // TODO: This should hard error the manager - tracing::error!(%err, "failed to write isolate runner"); } - }); + } } fn prewarm_image(self: &Arc, image_config: protocol::Image) { @@ -632,42 +606,9 @@ impl Ctx { // MARK: State re-initialization impl Ctx { - /// Fetches isolate runner state from the db. Should be called before the manager's runner websocket opens. - async fn rebuild_isolate_runner(self: &Arc) -> Result<()> { - let (isolate_runner_pid,) = utils::sql::query(|| async { - sqlx::query_as::<_, (Option,)>(indoc!( - " - SELECT isolate_runner_pid - FROM state - ", - )) - .fetch_one(&mut *self.sql().await?) 
- .await - }) - .await?; - - // Recreate isolate runner handle - if let Some(isolate_runner_pid) = isolate_runner_pid { - let mut guard = self.isolate_runner.write().await; - - tracing::info!(?isolate_runner_pid, "found existing isolate runner"); - - let runner = runner::Handle::from_pid( - runner::Comms::socket(), - Pid::from_raw(isolate_runner_pid), - self.isolate_runner_path(), - ); - self.observe_isolate_runner(&runner); - - *guard = Some(runner); - } - - Ok(()) - } - /// Destroys all active actors and runners and resets the database. async fn reset(self: &Arc, workflow_id: Uuid) -> Result<()> { - let ((last_workflow_id,), actor_rows) = tokio::try_join!( + let ((last_workflow_id,), runner_rows) = tokio::try_join!( // There should not be any database operations going on at this point so it is safe to read this // value utils::sql::query(|| async { @@ -680,16 +621,16 @@ impl Ctx { .await }), utils::sql::query(|| async { - sqlx::query_as::<_, ActorRow>(indoc!( + sqlx::query_as::<_, RunnerRow>(indoc!( " - SELECT actor_id, generation, config, pid, stop_ts - FROM actors + SELECT runner_id, comms, config, pid + FROM runners WHERE exit_ts IS NULL ", )) .fetch_all(&mut *self.sql().await?) .await - }) + }), )?; let Some(last_workflow_id) = last_workflow_id else { @@ -706,41 +647,19 @@ impl Ctx { "manager is resetting due to a workflow change" ); - let isolate_runner = { self.isolate_runner.read().await.clone() }; - - // Kill isolate runner - if let Some(isolate_runner) = &isolate_runner { - isolate_runner.signal(Signal::SIGKILL)?; - } - - for row in actor_rows { + // Kill all runners + for row in runner_rows { let Some(pid) = row.pid else { continue; }; - let config = serde_json::from_slice::(&row.config)?; - let generation = row.generation.try_into()?; - let metadata = config.metadata.deserialize()?; - - match &isolate_runner { - Some(isolate_runner) if pid == isolate_runner.pid().as_raw() => {} - _ => { - // Create a basic runner handle regardless of what the runner actually is (were just going to - // kill it). - let runner = runner::Handle::from_pid( - runner::Comms::Basic, - Pid::from_raw(pid), - self.actor_path(row.actor_id, generation), - ); - - // Kill runner - runner.signal(Signal::SIGKILL)?; + match kill(Pid::from_raw(pid), Signal::SIGKILL) { + Ok(_) => {} + Err(Errno::ESRCH) => { + tracing::warn!(?pid, "pid not found for signalling") } + Err(err) => return Err(err.into()), } - - // Clean up actor. We run `cleanup_setup` instead of `cleanup` because `cleanup` publishes events. - let actor = Actor::new(row.actor_id, generation, config, metadata); - actor.cleanup_setup(self).await; } // Stop any pending db operations @@ -805,7 +724,7 @@ impl Ctx { /// Rebuilds state from DB upon restart. async fn rebuild(self: &Arc, workflow_id: Uuid) -> Result<()> { - let ((last_event_idx,), actor_rows) = tokio::try_join!( + let ((last_event_idx,), runner_rows, actor_rows) = tokio::try_join!( // There should not be any database operations going on at this point so it is safe to read this // value utils::sql::query(|| async { @@ -820,106 +739,167 @@ impl Ctx { .fetch_one(&mut *self.sql().await?) .await }), + utils::sql::query(|| async { + sqlx::query_as::<_, RunnerRow>(indoc!( + " + SELECT runner_id, comms, config, pid + FROM runners + WHERE exit_ts IS NULL + ", + )) + .fetch_all(&mut *self.sql().await?) 
+ .await + }), utils::sql::query(|| async { sqlx::query_as::<_, ActorRow>(indoc!( " - SELECT actor_id, generation, config, pid, stop_ts + SELECT actor_id, generation, config FROM actors WHERE exit_ts IS NULL ", )) .fetch_all(&mut *self.sql().await?) .await - }) + }), )?; self.rebuild_images_cache().await?; self.event_sender.set_idx(last_event_idx + 1); - let isolate_runner = { self.isolate_runner.read().await.clone() }; - - // NOTE: Sqlite doesn't support arrays, can't parallelize this easily // Emit stop events - for row in &actor_rows { - if row.pid.is_none() && row.stop_ts.is_none() { - tracing::error!(actor_id=?row.actor_id, "actor has no pid, stopping"); + for row in &runner_rows { + if row.pid.is_none() { + tracing::error!(runner_id=?row.runner_id, "runner has no pid, stopping"); utils::sql::query(|| async { sqlx::query(indoc!( " - UPDATE actors - SET stop_ts = ?3 - WHERE - actor_id = ?1 AND - generation = ?2 + UPDATE runners + SET stop_ts = ?2 + WHERE runner_id = ?1 ", )) - .bind(row.actor_id) - .bind(row.generation) + .bind(row.runner_id) .bind(utils::now()) .execute(&mut *self.sql().await?) .await }) .await?; - self.event(protocol::Event::ActorStateUpdate { - actor_id: row.actor_id, - generation: row.generation.try_into()?, - state: protocol::ActorState::Lost, + let actor_rows = utils::sql::query(|| async { + sqlx::query_as::<_, (Uuid, i64, Option)>(indoc!( + " + UPDATE actors + SET stop_ts = ?2 + WHERE runner_id = ?1 + RETURNING actor_id, generation, stop_ts + ", + )) + .bind(row.runner_id) + .bind(utils::now()) + .fetch_all(&mut *self.sql().await?) + .await }) .await?; + + for (actor_id, generation, stop_ts) in &actor_rows { + if stop_ts.is_none() { + self.event(protocol::Event::ActorStateUpdate { + actor_id: *actor_id, + generation: (*generation).try_into()?, + state: protocol::ActorState::Lost, + }) + .await?; + } + } } } - // Start actor observers + let mut runners_guard = self.runners.write().await; let mut actors_guard = self.actors.write().await; - for row in actor_rows { + + // Start runner observers + for row in runner_rows { let Some(pid) = row.pid else { continue; }; - let config = serde_json::from_slice::(&row.config)?; - let generation = row.generation.try_into()?; - let metadata = config.metadata.deserialize()?; + let config = serde_json::from_slice::(&row.config)?; + + let runner = match runner::setup::Comms::from_repr(row.comms.try_into()?) + .context("bad comms variant")? 
+ { + runner::setup::Comms::Basic => Arc::new(Runner::from_pid( + row.runner_id, + runner::Comms::Basic, + config, + Pid::from_raw(pid), + )), + runner::setup::Comms::Socket => Arc::new(Runner::from_pid( + row.runner_id, + runner::Comms::socket(), + config, + Pid::from_raw(pid), + )), + }; + + runners_guard.insert(row.runner_id, runner.clone()); - let runner = match &isolate_runner { - // We have to clone the existing isolate runner handle instead of creating a new one so it - // becomes a shared reference - Some(isolate_runner) if pid == isolate_runner.pid().as_raw() => { - isolate_runner.clone() + let runner_id = row.runner_id; + let runner = runner.clone(); + let self2 = self.clone(); + tokio::spawn(async move { + if let Err(err) = runner.observe(&self2, false).await { + tracing::error!(?runner_id, ?err, "observe failed"); } - _ => match config.image.kind { - protocol::ImageKind::DockerImage | protocol::ImageKind::OciBundle => { - runner::Handle::from_pid( - runner::Comms::Basic, - Pid::from_raw(pid), - self.actor_path(row.actor_id, generation), - ) - } - protocol::ImageKind::JavaScript => runner::Handle::from_pid( - runner::Comms::socket(), - Pid::from_raw(pid), - self.actor_path(row.actor_id, generation), - ), - }, + + if let Err(err) = runner.cleanup(&self2).await { + tracing::error!(?runner_id, ?err, "cleanup failed"); + } + }); + } + + // Start actor observers + for row in actor_rows { + let config = serde_json::from_slice::(&row.config)?; + let runner_id = config + .runner + .as_ref() + .context("should have runner config")? + .runner_id(); + + let Some(runner) = runners_guard.get(&runner_id) else { + tracing::warn!(actor_id=?row.actor_id, ?runner_id, "actor's runner does not exist"); + continue; + }; + + // NOTE: No runner sockets are connected yet so there is no race condition with missed state + // updates here + let generation = row.generation.try_into()?; + let actor_observer = if let protocol::ImageAllocationType::Multi = + runner.config().image.allocation_type + { + Some(runner.new_actor_observer(row.actor_id, generation)) + } else { + None }; - let actor = Actor::with_runner(row.actor_id, generation, config, metadata, runner); let actor = actors_guard .entry((row.actor_id, generation)) - .or_insert(actor); + .or_insert(Actor::new(row.actor_id, generation, config, runner.clone())); + let actor_id = row.actor_id; let actor = actor.clone(); let self2 = self.clone(); tokio::spawn(async move { - if let Err(err) = actor.observe(&self2).await { - tracing::error!(actor_id=?row.actor_id, ?err, "observe failed"); + if let Err(err) = actor.observe(&self2, actor_observer).await { + tracing::error!(?actor_id, ?err, "observe failed"); } // Cleanup afterwards if let Err(err) = actor.cleanup(&self2).await { - tracing::error!(actor_id=?row.actor_id, ?err, "cleanup failed"); + tracing::error!(?actor_id, ?err, "cleanup failed"); } }); } @@ -1009,12 +989,12 @@ impl Ctx { &self.config.client } - pub fn actors_path(&self) -> PathBuf { - self.config().data_dir().join("actors") + pub fn runners_path(&self) -> PathBuf { + self.config().data_dir().join("runners") } - pub fn actor_path(&self, actor_id: Uuid, generation: u32) -> PathBuf { - self.actors_path().join(format!("{actor_id}-{generation}")) + pub fn runner_path(&self, runner_id: Uuid) -> PathBuf { + self.runners_path().join(runner_id.to_string()) } pub fn images_path(&self) -> PathBuf { diff --git a/packages/edge/infra/client/manager/src/lib.rs b/packages/edge/infra/client/manager/src/lib.rs index 24856d27a5..5239d4c7ad 100644 --- 
a/packages/edge/infra/client/manager/src/lib.rs +++ b/packages/edge/infra/client/manager/src/lib.rs @@ -1,5 +1,3 @@ -// TODO: Make tests work without this - // Test exports #[cfg(feature = "test")] diff --git a/packages/edge/infra/client/manager/src/main.rs b/packages/edge/infra/client/manager/src/main.rs index fa2ac4151c..42cf7d37e1 100644 --- a/packages/edge/infra/client/manager/src/main.rs +++ b/packages/edge/infra/client/manager/src/main.rs @@ -7,7 +7,7 @@ use std::{ use anyhow::*; use ctx::Ctx; use futures_util::StreamExt; -use pegboard::system_info::SystemInfo; +use pegboard::{protocol, system_info::SystemInfo}; use pegboard_config::Config; use rand::seq::{IteratorRandom, SliceRandom}; use service_discovery::ServiceDiscovery; @@ -230,7 +230,7 @@ async fn build_ws_url(config: &Config) -> Result { url.set_path(&format!("/v{PROTOCOL_VERSION}")); url.query_pairs_mut() .append_pair("client_id", &config.client.cluster.client_id.to_string()) - .append_pair("flavor", &config.client.runner.flavor.to_string()); + .append_pair("flavor", &protocol::ClientFlavor::Multi.to_string()); Ok(url) } diff --git a/packages/edge/infra/client/manager/src/metrics/mod.rs b/packages/edge/infra/client/manager/src/metrics/mod.rs index ae3d993b78..a47cdf35c9 100644 --- a/packages/edge/infra/client/manager/src/metrics/mod.rs +++ b/packages/edge/infra/client/manager/src/metrics/mod.rs @@ -23,20 +23,6 @@ lazy_static::lazy_static! { *REGISTRY, ).unwrap(); - pub static ref UNKNOWN_ISOLATE_RUNNER: IntCounterVec = register_int_counter_vec_with_registry!( - "unknown_isolate_runner", - "Total number of unknown isolate runners that were found and killed.", - &[], - *REGISTRY, - ).unwrap(); - - pub static ref DUPLICATE_RUNNER: IntCounterVec = register_int_counter_vec_with_registry!( - "duplicate_runner", - "Total number of duplicate runners that were found and killed.", - &["pid"], - *REGISTRY, - ).unwrap(); - pub static ref SQL_ERROR: IntCounterVec = register_int_counter_vec_with_registry!( "sql_error", "An SQL error occurred.", @@ -44,20 +30,6 @@ lazy_static::lazy_static! { *REGISTRY, ).unwrap(); - pub static ref SECOND_INIT: IntCounterVec = register_int_counter_vec_with_registry!( - "second_init", - "Total number of second init packets encountered.", - &[], - *REGISTRY, - ).unwrap(); - - pub static ref DOWNLOAD_IMAGE_DURATION: Histogram = register_histogram_with_registry!( - "download_image_duration", - "Duration of image download", - BUCKETS.to_vec(), - *REGISTRY, - ).unwrap(); - pub static ref DOWNLOAD_IMAGE_RATE: GaugeVec = register_gauge_vec_with_registry!( "download_image_rate", "Rate of image download in bytes/sec", @@ -108,13 +80,6 @@ lazy_static::lazy_static! 
{ *REGISTRY, ).unwrap(); - pub static ref SETUP_ISOLATE_DURATION: Histogram = register_histogram_with_registry!( - "actor_setup_isolate_duration", - "Duration of isolate setup", - BUCKETS.to_vec(), - *REGISTRY, - ).unwrap(); - pub static ref SETUP_PARALLEL_TASKS_DURATION: Histogram = register_histogram_with_registry!( "actor_setup_parallel_tasks_duration", "Duration of parallel setup tasks (image download/fs + ports/network)", diff --git a/packages/edge/infra/client/manager/src/runner.rs b/packages/edge/infra/client/manager/src/runner.rs deleted file mode 100644 index ffe4e1878c..0000000000 --- a/packages/edge/infra/client/manager/src/runner.rs +++ /dev/null @@ -1,391 +0,0 @@ -use std::{ - os::unix::process::CommandExt, - path::{Path, PathBuf}, - process::Stdio, - result::Result::{Err, Ok}, - sync::Arc, - time::Duration, -}; - -use anyhow::*; -use futures_util::{ - stream::{FuturesUnordered, SplitSink}, - FutureExt, SinkExt, StreamExt, -}; -use nix::{ - errno::Errno, - sys::{ - signal::{kill, Signal}, - wait::{waitpid, WaitStatus}, - }, - unistd::{fork, pipe, read, setsid, write, ForkResult, Pid}, -}; -use pegboard_config::runner_protocol; -use tokio::{fs, net::TcpStream, sync::Mutex}; -use tokio_tungstenite::{ - tungstenite::protocol::{ - frame::{coding::CloseCode, CloseFrame}, - Message, - }, - WebSocketStream, -}; - -use crate::{metrics, utils}; - -/// How often to check that a PID is still running when observing actor state. -const PID_POLL_INTERVAL: Duration = Duration::from_millis(1000); -/// How long before killing a runner with a socket if it has not pinged. -const PING_TIMEOUT: Duration = Duration::from_secs(5); - -#[derive(Debug)] -enum ObservationState { - Exited, - Running, - Dead, -} - -// NOTE: Cloneable because this is just a handle -#[derive(Clone)] -pub struct Handle { - pid: Pid, - working_path: PathBuf, - comms: Comms, -} - -impl Handle { - pub fn from_pid(comms: Comms, pid: Pid, working_path: PathBuf) -> Self { - Handle { - pid, - working_path, - comms, - } - } - - pub async fn attach_socket(&self, mut ws_stream: WebSocketStream) -> Result<()> { - match &self.comms { - Comms::Basic => bail!("attempt to attach socket to basic runner"), - Comms::Socket(tx) => { - tracing::info!(pid=?self.pid, "attaching socket"); - - let mut guard = tx.lock().await; - - if guard.is_none() { - let (ws_tx, mut ws_rx) = ws_stream.split(); - - *guard = Some(ws_tx); - - // Spawn a new thread to handle incoming messages - let self2 = self.clone(); - tokio::task::spawn(async move { - let kill = loop { - match tokio::time::timeout(PING_TIMEOUT, ws_rx.next()).await { - Ok(msg) => match msg { - Some(Ok(Message::Ping(_))) => {} - Some(Ok(Message::Close(_))) | None => { - tracing::debug!(pid=?self2.pid, "runner socket closed"); - break false; - } - Some(Ok(msg)) => { - tracing::warn!(pid=?self2.pid, ?msg, "unexpected message in runner socket") - } - Some(Err(err)) => { - tracing::error!(pid=?self2.pid, ?err, "runner socket error"); - break true; - } - }, - Err(_) => { - tracing::error!(pid=?self2.pid, "socket timed out, killing runner"); - - break true; - } - } - }; - - if kill { - if let Err(err) = self2.signal(Signal::SIGKILL) { - // TODO: This should hard error the manager? 
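Both the deleted `Handle` above and the new `Runner::receive_messages` keep the same watchdog shape: every socket read races a ping deadline, and silence is treated like a fatal socket error, after which the runner is killed. A stripped-down sketch with stand-in message and error types:

```rust
use std::time::Duration;

use futures_util::{Stream, StreamExt};

const PING_TIMEOUT: Duration = Duration::from_secs(5);

// `M`/`E` stand in for tungstenite's message and error types.
async fn watch_socket<S, M, E>(mut ws_rx: S) -> Result<(), &'static str>
where
    S: Stream<Item = Result<M, E>> + Unpin,
{
    loop {
        match tokio::time::timeout(PING_TIMEOUT, ws_rx.next()).await {
            // Pings and payloads both count as liveness.
            Ok(Some(Ok(_msg))) => {}
            Ok(Some(Err(_err))) => return Err("socket error"),
            // Stream ended: the peer closed its side cleanly.
            Ok(None) => return Ok(()),
            // No traffic inside the ping window.
            Err(_elapsed) => return Err("ping timeout"),
        }
    }
}
```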
- tracing::error!(pid=?self2.pid, %err, "failed to kill runner"); - } - } - }); - - tracing::info!(pid=?self.pid, "socket attached"); - } else { - tracing::warn!(pid=?self.pid, "runner received another socket, terminating new one"); - - metrics::DUPLICATE_RUNNER - .with_label_values(&[&self.pid.to_string()]) - .inc(); - - ws_stream - .send(Message::Binary(serde_json::to_vec( - &runner_protocol::ToRunner::Terminate, - )?)) - .await?; - - let close_frame = CloseFrame { - code: CloseCode::Error, - reason: "unknown runner".into(), - }; - ws_stream.send(Message::Close(Some(close_frame))).await?; - } - } - } - - Ok(()) - } - - pub async fn send(&self, packet: &runner_protocol::ToRunner) -> Result<()> { - match &self.comms { - Comms::Basic => bail!("cannot send socket message to basic runner"), - Comms::Socket(socket) => { - // Wait for socket to connect in a retry loop - let mut attempts = 0; - let mut guard = loop { - { - let guard = socket.lock().await; - if guard.is_some() { - break guard; - } - } - - if attempts == 0 { - tracing::warn!(pid=?self.pid, "socket not yet attached, can't send message. retrying"); - } - - attempts += 1; - if attempts > 15 { - bail!( - "timed out waiting for runner socket (pid {}) to attach", - self.pid - ); - } - - tokio::time::sleep(std::time::Duration::from_millis(125)).await; - }; - - let socket = guard.as_mut().expect("should exist"); - let buf = serde_json::to_vec(packet)?; - socket - .send(Message::Binary(buf)) - .await - .context("failed to send packet to socket")?; - } - } - - Ok(()) - } - - pub fn spawn_orphaned( - comms: Comms, - runner_binary_path: &Path, - container_id: &str, - working_path: PathBuf, - env: &[(&str, String)], - ) -> Result { - // Prepare the arguments for the runner - let runner_args = vec![working_path.to_str().context("bad path")?, container_id]; - - // NOTE: Pipes are automatically closed on drop - // Pipe communication between processes - let (pipe_read, pipe_write) = pipe()?; - - // NOTE: This is why we fork the process twice: https://stackoverflow.com/a/5386753 - match unsafe { fork() }.context("process first fork failed")? { - ForkResult::Parent { child } => { - // Close the writing end of the pipe in the parent - nix::unistd::close(pipe_write)?; - - // Ensure that the child process spawned successfully - match waitpid(child, None).context("waitpid failed")? 
{ - WaitStatus::Exited(_, 0) => { - // Read the second child's PID from the pipe - let mut buf = [0u8; 4]; - read(pipe_read, &mut buf)?; - let orphan_pid = Pid::from_raw(i32::from_le_bytes(buf)); - - Ok(Handle { - pid: orphan_pid, - working_path, - comms, - }) - } - WaitStatus::Exited(_, status) => { - bail!("Child process exited with status {}", status) - } - _ => bail!("Unexpected wait status for child process"), - } - } - ForkResult::Child => { - // Child process - match unsafe { fork() } { - Result::Ok(ForkResult::Parent { child }) => { - // Write the second child's PID to the pipe - let orphan_pid_bytes = child.as_raw().to_le_bytes(); - write(pipe_write, &orphan_pid_bytes)?; - - // Exit the intermediate child - std::process::exit(0); - } - Result::Ok(ForkResult::Child) => { - // Disassociate from the parent by creating a new session - setsid().context("setsid failed")?; - - // Adjust nice, cpu priority, and OOM score - let pid = std::process::id() as i32; - utils::libc::set_nice_level(pid, 0).context("failed to set nice level")?; - utils::libc::set_oom_score_adj(pid, 0) - .context("failed to set oom score adjustment")?; - utils::libc::set_scheduling_policy( - pid, - utils::libc::SchedPolicy::Other, - // Must be 0 with SCHED_OTHER - 0, - ) - .context("failed to set scheduling policy")?; - - // Exit immediately on fail in order to not leak process - let err = std::process::Command::new(&runner_binary_path) - .args(&runner_args) - .envs(env.iter().cloned()) - .stdin(Stdio::null()) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .exec(); - eprintln!("exec failed: {err:?}"); - std::process::exit(1); - } - Err(err) => { - // Exit immediately in order to not leak child process. - eprintln!("process second fork failed: {err:?}"); - std::process::exit(1); - } - } - } - } - } - - pub async fn observe(&self) -> Result> { - let exit_code_path = self.working_path.join("exit-code"); - let proc_path = Path::new("/proc").join(self.pid.to_string()); - - let mut futs = FuturesUnordered::new(); - - // Watch for exit code file being written - futs.push( - async { - utils::wait_for_write(&exit_code_path).await?; - - anyhow::Ok(ObservationState::Exited) - } - .boxed(), - ); - - // Polling interval to check that the pid still exists - futs.push( - async { - tokio::time::sleep(PID_POLL_INTERVAL).await; - - if fs::metadata(&proc_path).await.is_ok() { - anyhow::Ok(ObservationState::Running) - } else { - anyhow::Ok(ObservationState::Dead) - } - } - .boxed(), - ); - - let state = loop { - // Get next complete future - if let Some(state) = futs.next().await { - let state = state?; - - // If still running, add poll future back to list - if let ObservationState::Running = state { - futs.push( - async { - tokio::time::sleep(PID_POLL_INTERVAL).await; - - if fs::metadata(&proc_path).await.is_ok() { - Ok(ObservationState::Running) - } else { - Ok(ObservationState::Dead) - } - } - .boxed(), - ); - } else { - break state; - } - } else { - bail!("observation failed, developer error"); - } - }; - - let exit_code = if let ObservationState::Exited = state { - use std::result::Result::Ok; - match fs::read_to_string(&exit_code_path).await { - Ok(contents) => match contents.trim().parse::() { - Ok(x) => Some(x), - Err(err) => { - tracing::error!(pid=?self.pid, ?err, "failed to parse exit code file"); - - None - } - }, - Err(err) => { - tracing::error!(pid=?self.pid, ?err, "failed to read exit code file"); - - None - } - } - } else { - tracing::warn!(pid=?self.pid, "process died before exit code file was written"); - - None 
- }; - - tracing::info!(pid=?self.pid, ?exit_code, "exited"); - - Ok(exit_code) - } - - pub fn signal(&self, signal: Signal) -> Result<()> { - // https://pubs.opengroup.org/onlinepubs/9699919799/functions/kill.html - if (signal as i32) < 1 { - bail!("signals < 1 not allowed"); - } - - match kill(self.pid, signal) { - Ok(_) => {} - Err(Errno::ESRCH) => { - tracing::warn!(pid=?self.pid, "pid not found for signalling") - } - Err(err) => return Err(err.into()), - } - - Ok(()) - } -} - -impl Handle { - pub fn pid(&self) -> &Pid { - &self.pid - } - - pub fn has_socket(&self) -> bool { - matches!(self.comms, Comms::Socket(_)) - } -} - -#[derive(Clone)] -pub enum Comms { - Basic, - Socket(Arc, Message>>>>), -} - -impl Comms { - pub fn socket() -> Self { - Comms::Socket(Arc::new(Mutex::new(None))) - } -} diff --git a/packages/edge/infra/client/manager/src/runner/mod.rs b/packages/edge/infra/client/manager/src/runner/mod.rs new file mode 100644 index 0000000000..75392e2942 --- /dev/null +++ b/packages/edge/infra/client/manager/src/runner/mod.rs @@ -0,0 +1,666 @@ +use std::{ + path::Path, + result::Result::{Err, Ok}, + sync::Arc, + time::Duration, +}; + +use anyhow::*; +use futures_util::{ + stream::{FuturesUnordered, SplitSink, SplitStream}, + FutureExt, SinkExt, StreamExt, +}; +use indoc::indoc; +use nix::{ + errno::Errno, + sys::signal::{kill, Signal}, + unistd::Pid, +}; +use pegboard::protocol; +use pegboard_config::runner_protocol; +use tokio::{ + fs, + net::TcpStream, + sync::{broadcast, Mutex, RwLock}, +}; +use tokio_tungstenite::{ + tungstenite::protocol::{ + frame::{coding::CloseCode, CloseFrame}, + Message, + }, + WebSocketStream, +}; +use uuid::Uuid; + +use crate::{ctx::Ctx, utils}; + +mod oci_config; +mod partial_oci_config; +mod seccomp; +pub(crate) mod setup; + +/// How often to check that a PID is still running when observing actor state. +const PID_POLL_INTERVAL: Duration = Duration::from_millis(1000); +/// How long before killing a runner with a socket if it has not pinged. +const PING_TIMEOUT: Duration = Duration::from_secs(5); +/// How long to wait when waiting for the socket to become ready before timing out. +const SOCKET_READY_TIMEOUT: Duration = Duration::from_secs(3); +/// How long to wait when getting the PID before timing out. +const GET_PID_TIMEOUT: Duration = Duration::from_secs(256); +// IMPORTANT: This cannot be just `rivet-` because this is used as a prefix to filter cgroup names +// in cadvisor. +// +// If this was "rivet-", we'd have to report on non-actor cgroups with cadvisor. +// +// See also packages/core/services/cluster/src/workflows/server/install/install_scripts/files/cadvisor_metric_exporter.sh & packages/core/api/actor/src/route/metrics.rs +pub const RIVET_CONTAINER_PREFIX: &str = "pegboard-actor-"; + +#[derive(sqlx::FromRow)] +pub struct ProxiedPortRow { + label: String, + source: i64, + target: Option, + protocol: i64, +} + +#[derive(Debug)] +enum ObservationState { + Exited, + Running, + Dead, +} + +pub struct Runner { + runner_id: Uuid, + comms: Comms, + config: protocol::RunnerConfig, + + pid: RwLock>, + + /// Used instead of polling loops for faster updates. 
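The field declared just below is the "bump channel" this comment refers to. A sketch of the idiom, which `send` and `pid` further down use in place of the old fixed-interval polling loops; the `Bump` wrapper is illustrative, not part of the patch:

```rust
use tokio::sync::broadcast::{self, error::RecvError};

// Writers update shared state and then send a unit "bump"; waiters subscribe
// *before* re-checking the state so a concurrent update can never be missed,
// and they re-check on every wake-up, so lagging behind is harmless.
struct Bump(broadcast::Sender<()>);

impl Bump {
    fn new() -> Self {
        // A tiny buffer is fine: the payload carries no data, only "look again".
        Bump(broadcast::channel(2).0)
    }

    fn bump(&self) {
        // `send` errors only when there are no receivers, which just means
        // nobody is currently waiting.
        let _ = self.0.send(());
    }

    async fn wait_until(&self, ready: impl Fn() -> bool) {
        let mut sub = self.0.subscribe();
        loop {
            if ready() {
                return;
            }
            match sub.recv().await {
                Ok(()) | Err(RecvError::Lagged(_)) => continue,
                Err(RecvError::Closed) => return,
            }
        }
    }
}
```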
+ bump_channel: broadcast::Sender<()>, + + actor_observer_tx: broadcast::Sender<(Uuid, u32, runner_protocol::ActorState)>, +} + +impl Runner { + pub fn new(runner_id: Uuid, comms: Comms, config: protocol::RunnerConfig) -> Self { + Runner { + runner_id, + comms, + config, + pid: RwLock::new(None), + bump_channel: broadcast::channel(2).0, + actor_observer_tx: broadcast::channel(16).0, + } + } + + pub fn from_pid( + runner_id: Uuid, + comms: Comms, + config: protocol::RunnerConfig, + pid: Pid, + ) -> Self { + Runner { + runner_id, + comms, + config, + pid: RwLock::new(Some(pid)), + bump_channel: broadcast::channel(1).0, + actor_observer_tx: broadcast::channel(16).0, + } + } + + fn bump(&self) { + let _ = self.bump_channel.send(()); + } + + pub async fn attach_socket( + self: &Arc, + mut ws_stream: WebSocketStream, + ) -> Result<()> { + match &self.comms { + Comms::Basic => bail!("attempt to attach socket to basic runner"), + Comms::Socket(tx) => { + tracing::info!(runner_id=?self.runner_id, "attaching socket"); + + let mut guard = tx.lock().await; + + if guard.is_none() { + let (ws_tx, ws_rx) = ws_stream.split(); + + *guard = Some(ws_tx); + self.bump(); + + // Spawn a new thread to handle incoming messages + let self2 = self.clone(); + tokio::task::spawn(async move { + if let Err(err) = self2.receive_messages(ws_rx).await { + tracing::error!(runner_id=?self2.runner_id, ?err, "socket error, killing runner"); + + if let Err(err) = self2.signal(Signal::SIGKILL).await { + // TODO: This should hard error the manager? + tracing::error!(runner_id=?self2.runner_id, %err, "failed to kill runner"); + } + } + }); + + tracing::info!(runner_id=?self.runner_id, "socket attached"); + } else { + tracing::warn!(runner_id=?self.runner_id, "runner received another socket, closing new one"); + + let close_frame = CloseFrame { + code: CloseCode::Error, + reason: "unknown runner".into(), + }; + ws_stream.send(Message::Close(Some(close_frame))).await?; + } + } + } + + Ok(()) + } + + async fn receive_messages( + &self, + mut ws_rx: SplitStream>, + ) -> Result<()> { + loop { + match tokio::time::timeout(PING_TIMEOUT, ws_rx.next()).await { + Ok(msg) => match msg { + Some(Ok(Message::Ping(_))) => { + // Pongs are sent automatically + } + Some(Ok(Message::Close(_))) | None => { + tracing::debug!(runner_id=?self.runner_id, "runner socket closed"); + break Ok(()); + } + Some(Ok(Message::Binary(buf))) => { + let packet = serde_json::from_slice::(&buf)?; + + self.process_packet(packet).await?; + } + Some(Ok(packet)) => bail!("runner socket unexpected packet: {packet:?}"), + Some(Err(err)) => break Err(err).context("runner socket error"), + }, + Err(_) => bail!("socket timed out"), + } + } + } + + async fn process_packet(&self, packet: runner_protocol::ToManager) -> Result<()> { + tracing::debug!(?packet, "runner received packet"); + + match packet { + runner_protocol::ToManager::Init { .. 
} => bail!("unexpected second init packet"), + runner_protocol::ToManager::ActorStateUpdate { + actor_id, + generation, + state, + } => { + // NOTE: No receivers is not an error + let _ = self.actor_observer_tx.send((actor_id, generation, state)); + } + } + + Ok(()) + } + + pub async fn send(&self, packet: &runner_protocol::ToRunner) -> Result<()> { + match &self.comms { + Comms::Basic => bail!("cannot send socket message to basic runner"), + Comms::Socket(socket) => { + let mut sub = self.bump_channel.subscribe(); + + // Wait for socket to connect in a retry loop + let mut guard = tokio::time::timeout(SOCKET_READY_TIMEOUT, async { + loop { + { + let guard = socket.lock().await; + if guard.is_some() { + break anyhow::Ok(guard); + } + } + + tracing::warn!( + runner_id=?self.runner_id, + "socket not yet attached, can't send message. retrying", + ); + + sub.recv().await.context("bump channel closed")?; + } + }) + .await + .with_context(|| { + format!( + "timed out waiting for runner {} socket to attach", + self.runner_id + ) + })??; + + let socket = guard.as_mut().expect("should exist"); + let buf = serde_json::to_vec(packet)?; + socket + .send(Message::Binary(buf)) + .await + .context("failed to send packet to socket")?; + } + } + + Ok(()) + } + + pub async fn start( + self: &Arc, + ctx: &Arc, + ) -> Result> { + tracing::info!(runner_id=?self.runner_id, "starting"); + + // Write runner to DB + let config_json = serde_json::to_vec(&self.config)?; + + utils::sql::query(|| async { + // NOTE: On conflict here in case this query runs but the command is not acknowledged + sqlx::query(indoc!( + " + INSERT INTO runners ( + runner_id, + comms, + config, + start_ts + ) + VALUES (?1, ?2, ?3, ?4) + ON CONFLICT (runner_id) DO NOTHING + ", + )) + .bind(self.runner_id) + .bind(if self.has_socket() { + setup::Comms::Socket + } else { + setup::Comms::Basic + } as i32) + .bind(&config_json) + .bind(utils::now()) + .execute(&mut *ctx.sql().await?) + .await + }) + .await?; + + // Setup needs to occur outside of spawned task because the ports are returned + let proxied_ports = match self.setup(&ctx).await { + Ok(proxied_ports) => proxied_ports, + Err(err) => { + tracing::error!(runner_id=?self.runner_id, ?err, "setup failed"); + + // Cleanup afterwards + if let Err(err) = self.cleanup(&ctx).await { + tracing::error!(runner_id=?self.runner_id, ?err, "cleanup failed"); + } + + return Err(err); + } + }; + + // Lifecycle + let self2 = self.clone(); + let ctx2 = ctx.clone(); + tokio::spawn(async move { + match self2.run(&ctx2).await { + Ok(_) => { + if let Err(err) = self2.observe(&ctx2, false).await { + tracing::error!(runner_id=?self2.runner_id, ?err, "observe failed"); + } + } + Err(err) => { + tracing::error!(runner_id=?self2.runner_id, ?err, "run failed") + } + } + + // Cleanup afterwards + if let Err(err) = self2.cleanup(&ctx2).await { + tracing::error!(runner_id=?self2.runner_id, ?err, "cleanup failed"); + } + }); + + Ok(proxied_ports) + } + + async fn run(&self, ctx: &Ctx) -> Result<()> { + // NOTE: This is the env that goes to the container-runner process, NOT the env that is inserted into + // the container. 
+ let mut runner_env = vec![ + ( + "ROOT_USER_ENABLED", + if self.config.root_user_enabled { + "1" + } else { + "0" + } + .to_string(), + ), + ("RUNNER_ID", self.runner_id.to_string()), + ]; + if let Some(vector) = &ctx.config().vector { + runner_env.push(("VECTOR_SOCKET_ADDR", vector.address.to_string())); + } + + self.spawn_orphaned(ctx, &runner_env).await + } + + // Silent prevents dupe logs, this function is called for every actor running on this runner as well as + // for the runner's observer task + pub async fn observe(&self, ctx: &Ctx, silent: bool) -> Result> { + let pid = self.pid().await?; + + let runner_path = ctx.runner_path(self.runner_id); + let exit_code_path = runner_path.join("exit-code"); + let proc_path = Path::new("/proc").join(pid.to_string()); + + let mut futs = FuturesUnordered::new(); + + // Watch for exit code file being written + futs.push( + async { + utils::wait_for_write(&exit_code_path).await?; + + anyhow::Ok(ObservationState::Exited) + } + .boxed(), + ); + + // Polling interval to check that the pid still exists + futs.push( + async { + tokio::time::sleep(PID_POLL_INTERVAL).await; + + if fs::metadata(&proc_path).await.is_ok() { + anyhow::Ok(ObservationState::Running) + } else { + anyhow::Ok(ObservationState::Dead) + } + } + .boxed(), + ); + + let state = loop { + // Get next complete future + let state = futs + .next() + .await + .context("observation failed, developer error")??; + + // If still running, add poll future back to list + if let ObservationState::Running = state { + futs.push( + async { + tokio::time::sleep(PID_POLL_INTERVAL).await; + + if fs::metadata(&proc_path).await.is_ok() { + Ok(ObservationState::Running) + } else { + Ok(ObservationState::Dead) + } + } + .boxed(), + ); + } else { + break state; + } + }; + + let exit_code = if let ObservationState::Exited = state { + use std::result::Result::Ok; + match fs::read_to_string(&exit_code_path).await { + Ok(contents) => match contents.trim().parse::() { + Ok(x) => Some(x), + Err(err) => { + if !silent { + tracing::error!(runner_id=?self.runner_id, ?err, "failed to parse exit code file"); + } + + None + } + }, + Err(err) => { + if !silent { + tracing::error!(runner_id=?self.runner_id, ?err, "failed to read exit code file"); + } + + None + } + } + } else { + if !silent { + tracing::warn!(runner_id=?self.runner_id, "process died before exit code file was written"); + } + + None + }; + + if !silent { + tracing::info!(runner_id=?self.runner_id, ?exit_code, "exited"); + } + + self.set_exit_code(ctx, exit_code).await?; + + Ok(exit_code) + } + + pub fn new_actor_observer(&self, actor_id: Uuid, generation: u32) -> ActorObserver { + ActorObserver::new(actor_id, generation, self.actor_observer_tx.subscribe()) + } + + pub async fn signal(&self, signal: Signal) -> Result<()> { + // https://pubs.opengroup.org/onlinepubs/9699919799/functions/kill.html + if (signal as i32) < 1 { + bail!("signals < 1 not allowed"); + } + + let pid = self.pid().await?; + + match kill(pid, signal) { + Ok(_) => {} + Err(Errno::ESRCH) => { + tracing::warn!(?pid, "pid not found for signalling") + } + Err(err) => return Err(err.into()), + } + + Ok(()) + } + + #[tracing::instrument(skip_all)] + pub async fn set_exit_code(&self, ctx: &Ctx, exit_code: Option) -> Result<()> { + // Update DB + utils::sql::query(|| async { + sqlx::query(indoc!( + " + UPDATE runners + SET + exit_ts = ?2, + exit_code = ?3 + WHERE + runner_id = ?1 AND + exit_ts IS NULL + ", + )) + .bind(self.runner_id) + .bind(utils::now()) + .bind(exit_code) + .execute(&mut 
*ctx.sql().await?) + .await + }) + .await?; + + Ok(()) + } + + #[tracing::instrument(skip_all)] + pub async fn cleanup(&self, ctx: &Ctx) -> Result<()> { + tracing::info!(runner_id=?self.runner_id, "cleaning up runner"); + + // Set exit code if it hasn't already been set + self.set_exit_code(ctx, None).await?; + + // Unbind ports + utils::sql::query(|| async { + sqlx::query(indoc!( + " + UPDATE runner_ports + SET delete_ts = ?3 + WHERE + runner_id = ?1 + ", + )) + .bind(self.runner_id) + .bind(utils::now()) + .execute(&mut *ctx.sql().await?) + .await + }) + .await?; + + // Cleanup setup. Should only be called after the exit code is set successfully for consistent state + self.cleanup_setup(ctx).await; + + // It is important that we remove from the runners list last so that we prevent duplicates + { + let mut runners = ctx.runners.write().await; + runners.remove(&self.runner_id); + } + + Ok(()) + } +} + +impl Runner { + pub fn config(&self) -> &protocol::RunnerConfig { + &self.config + } + + pub fn container_id(&self) -> String { + format!("{RIVET_CONTAINER_PREFIX}{}", self.runner_id) + } + + pub async fn ports( + &self, + ctx: &Ctx, + ) -> Result> { + let rows = utils::sql::query(|| async { + sqlx::query_as::<_, ProxiedPortRow>(indoc!( + " + SELECT label, source, target, protocol FROM runner_ports + WHERE + runner_id = ?1 AND + delete_ts IS NULL + ", + )) + .bind(self.runner_id) + .fetch_all(&mut *ctx.sql().await?) + .await + }) + .await?; + + rows.into_iter() + .map(|row| { + let source = row.source.try_into()?; + + Ok(( + row.label, + protocol::ProxiedPort { + source, + target: row + .target + .map(TryInto::try_into) + .transpose()? + .unwrap_or(source), + lan_hostname: ctx.config().network.lan_hostname.clone(), + protocol: protocol::TransportProtocol::from_repr(row.protocol.try_into()?) + .context("bad port protocol")?, + }, + )) + }) + .collect() + } + + pub async fn pid(&self) -> Result { + let mut sub = self.bump_channel.subscribe(); + let mut i = 0; + + tokio::time::timeout(GET_PID_TIMEOUT, async { + loop { + { + if let Some(pid) = *self.pid.read().await { + break anyhow::Ok(pid); + } + } + + // Progress log + if i % 10 == 0 { + tracing::warn!( + runner_id=?self.runner_id, + "waiting for pid of runner", + ); + } + + i += 1; + + sub.recv().await.context("bump channel closed")?; + } + }) + .await + .with_context(|| { + format!( + "timed out waiting for runner {} to get PID, considering runner stopped", + self.runner_id, + ) + })? 
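`send` and `pid` share one synchronization idiom: read optional shared state, and if it is not ready yet, park on a broadcast "bump" channel that writers notify whenever the state changes, all bounded by a timeout. A generic sketch of that idiom (names are illustrative):

use std::time::Duration;

use anyhow::{Context, Result};
use tokio::sync::{broadcast, RwLock};

async fn wait_for<T: Copy>(
    slot: &RwLock<Option<T>>,
    bump: &broadcast::Sender<()>,
    timeout: Duration,
) -> Result<T> {
    // Subscribe before the first check so a bump arriving between the
    // check and `recv` cannot be lost.
    let mut sub = bump.subscribe();

    tokio::time::timeout(timeout, async {
        loop {
            if let Some(value) = *slot.read().await {
                break anyhow::Ok(value);
            }

            sub.recv().await.context("bump channel closed")?;
        }
    })
    .await
    .context("timed out waiting for state")?
}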
+ } + + pub fn has_socket(&self) -> bool { + matches!(self.comms, Comms::Socket(_)) + } +} + +pub enum Comms { + Basic, + Socket(Mutex, Message>>>), +} + +impl Comms { + pub fn socket() -> Self { + Comms::Socket(Mutex::new(None)) + } +} + +pub struct ActorObserver { + actor_id: Uuid, + generation: u32, + sub: broadcast::Receiver<(Uuid, u32, runner_protocol::ActorState)>, +} + +impl ActorObserver { + fn new( + actor_id: Uuid, + generation: u32, + sub: broadcast::Receiver<(Uuid, u32, runner_protocol::ActorState)>, + ) -> Self { + ActorObserver { + actor_id, + generation, + sub, + } + } + pub async fn next(&mut self) -> Option { + loop { + let Ok((other_actor_id, other_generation, state)) = self.sub.recv().await else { + tracing::error!("actor observer channel dropped"); + + return None; + }; + + if self.actor_id == other_actor_id && self.generation == other_generation { + return Some(state); + } + } + } +} diff --git a/packages/edge/infra/client/manager/src/actor/oci_config.rs b/packages/edge/infra/client/manager/src/runner/oci_config.rs similarity index 92% rename from packages/edge/infra/client/manager/src/actor/oci_config.rs rename to packages/edge/infra/client/manager/src/runner/oci_config.rs index aff0ae4776..3ab8196ed9 100644 --- a/packages/edge/infra/client/manager/src/actor/oci_config.rs +++ b/packages/edge/infra/client/manager/src/runner/oci_config.rs @@ -5,7 +5,7 @@ use std::path::Path; use super::{partial_oci_config::PartialOciConfigUser, seccomp}; pub struct ConfigOpts<'a> { - pub actor_path: &'a Path, + pub runner_path: &'a Path, pub netns_path: &'a Path, pub args: Vec, pub env: Vec, @@ -22,7 +22,7 @@ pub struct ConfigOpts<'a> { /// Sanitize the config.json by copying safe properties from the provided bundle in to our base config. pub fn config(opts: ConfigOpts) -> Result { // CPU shares is a relative weight. It doesn't matter what unit we pass here as - // long as the ratios between the actors are correct. + // long as the ratios between the runners are correct. // // Corresponds to cpu.weight in cgroups. Must be [1, 10_000] // @@ -36,15 +36,15 @@ pub fn config(opts: ConfigOpts) -> Result { tracing::warn!(?cpu_shares, "cpu_shares < 1"); } - // This is a modified version of the default config.json generated by actord. + // This is a modified version of the default config.json generated by containerd. // // Some values will be overridden at runtime by the values in the OCI bundle's config.json. 
// // Default Docker spec: https://github.com/moby/moby/blob/777e9f271095685543f30df0ff7a12397676f938/oci/defaults.go#L49 // - // Generate config.json with actord: - // ctr run --rm -t --seccomp docker.io/library/debian:latest debian-actor-id /bin/bash - // cat /run/actord/io.actord.runtime.v2.task/default/debian-actor-id/config.json | jq + // Generate config.json with containerd: + // ctr run --rm -t --seccomp docker.io/library/debian:latest debian-container-id /bin/bash + // cat /run/containerd/io.containerd.runtime.v2.task/default/debian-container-id/config.json | jq Ok(json!({ "ociVersion": "1.0.2-dev", "process": { @@ -249,7 +249,7 @@ fn mounts(opts: &ConfigOpts) -> Result { { "destination": "/etc/resolv.conf", "type": "bind", - "source": opts.actor_path.join("resolv.conf").to_str().context("resolv.conf path")?, + "source": opts.runner_path.join("resolv.conf").to_str().context("resolv.conf path")?, "options": ["rbind", "rprivate"] }, { @@ -264,8 +264,8 @@ fn mounts(opts: &ConfigOpts) -> Result { fn linux_resources_devices() -> serde_json::Value { // Devices implicitly contains the following devices: // null, zero, full, random, urandom, tty, console, and ptmx. - // ptmx is a bind mount or symlink of the actor's ptmx. - // See also: https://github.com/openactors/runtime-spec/blob/master/config-linux.md#default-devices + // ptmx is a bind mount or symlink of the container's ptmx. + // See also: https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#default-devices json!([ { "allow": false, diff --git a/packages/edge/infra/client/manager/src/actor/partial_oci_config.rs b/packages/edge/infra/client/manager/src/runner/partial_oci_config.rs similarity index 100% rename from packages/edge/infra/client/manager/src/actor/partial_oci_config.rs rename to packages/edge/infra/client/manager/src/runner/partial_oci_config.rs diff --git a/packages/edge/infra/client/manager/src/actor/seccomp.rs b/packages/edge/infra/client/manager/src/runner/seccomp.rs similarity index 99% rename from packages/edge/infra/client/manager/src/actor/seccomp.rs rename to packages/edge/infra/client/manager/src/runner/seccomp.rs index f84b210fdf..c82c2c94d8 100644 --- a/packages/edge/infra/client/manager/src/actor/seccomp.rs +++ b/packages/edge/infra/client/manager/src/runner/seccomp.rs @@ -1,6 +1,6 @@ use serde_json::json; -// Copied from auto-generated actord +// Copied from auto-generated containerd // // See comment in super::oci_conifg::config on how to generate this pub fn config() -> serde_json::Value { diff --git a/packages/edge/infra/client/manager/src/actor/setup.rs b/packages/edge/infra/client/manager/src/runner/setup.rs similarity index 59% rename from packages/edge/infra/client/manager/src/actor/setup.rs rename to packages/edge/infra/client/manager/src/runner/setup.rs index 9d7336d04a..d9fad2e1dd 100644 --- a/packages/edge/infra/client/manager/src/actor/setup.rs +++ b/packages/edge/infra/client/manager/src/runner/setup.rs @@ -1,44 +1,109 @@ use std::{ collections::HashMap, + os::unix::process::CommandExt, path::{Path, PathBuf}, + process::Stdio, result::Result::{Err, Ok}, + sync::Arc, time::Instant, }; use anyhow::*; use indoc::indoc; +use nix::{ + sys::wait::{waitpid, WaitStatus}, + unistd::{fork, pipe, read, setsid, write, ForkResult, Pid}, +}; use pegboard::protocol; -use pegboard_config::isolate_runner::actor as actor_config; use rand::Rng; use serde_json::json; +use sqlx::Acquire; +use strum::FromRepr; use tokio::{ fs::{self, File}, process::Command, }; use uuid::Uuid; -use 
super::{oci_config, Actor}; +use super::{oci_config, Runner}; use crate::{ctx::Ctx, utils}; -impl Actor { +#[derive(Hash, Debug, Clone, Copy, PartialEq, Eq, FromRepr)] +pub enum Comms { + Basic = 0, + Socket = 1, +} + +impl Runner { + pub async fn setup( + self: &Arc, + ctx: &Arc, + ) -> Result> { + let setup_start_instant = std::time::Instant::now(); + tracing::info!(runner_id=?self.runner_id, "setting up runner"); + + tracing::info!(runner_id=?self.runner_id, "creating runner working directory"); + + let runner_path = ctx.runner_path(self.runner_id); + fs::create_dir(&runner_path) + .await + .context("failed to create runner dir")?; + + tracing::info!(runner_id=?self.runner_id, "starting setup tasks"); + let tasks_start_instant = Instant::now(); + + let (_, ports) = tokio::try_join!( + async { + self.make_fs(&ctx).await?; + self.download_image(&ctx).await?; + + Result::<(), anyhow::Error>::Ok(()) + }, + async { + let ports = self.bind_ports(ctx).await?; + + if let protocol::NetworkMode::Bridge = self.config.network_mode { + self.setup_cni_network(&ctx, &ports).await?; + } + + Ok(ports) + }, + )?; + + crate::metrics::SETUP_PARALLEL_TASKS_DURATION + .observe(tasks_start_instant.elapsed().as_secs_f64()); + tracing::info!( + runner_id=?self.runner_id, + "setup tasks completed" + ); + + tracing::info!(runner_id=?self.runner_id, "setting up runtime environment"); + self.setup_oci_bundle(&ctx, &ports).await?; + + crate::metrics::SETUP_TOTAL_DURATION.observe(setup_start_instant.elapsed().as_secs_f64()); + tracing::info!(runner_id=?self.runner_id, "runner setup completed"); + + Ok(ports) + } + pub async fn make_fs(&self, ctx: &Ctx) -> Result<()> { let timer = Instant::now(); - let actor_path = ctx.actor_path(self.actor_id, self.generation); + let runner_path = ctx.runner_path(self.runner_id); - let fs_img_path = actor_path.join("fs.img"); - let fs_path = actor_path.join("fs"); + let fs_img_path = runner_path.join("fs.img"); + let fs_path = runner_path.join("fs"); let fs_upper_path = fs_path.join("upper"); let fs_work_path = fs_path.join("work"); let image_path = ctx.image_path(self.config.image.id); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "creating fs"); + tracing::info!(runner_id=?self.runner_id, "creating fs"); fs::create_dir(&fs_path) .await - .context("failed to create actor fs dir")?; + .context("failed to create runner fs dir")?; if ctx.config().runner.use_mounts() { - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "creating disk image"); + tracing::info!(runner_id=?self.runner_id, "creating disk image"); // Create a zero-filled file let fs_img = File::create(&fs_img_path) .await @@ -48,7 +113,7 @@ impl Actor { .await .context("failed to set disk image length")?; - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "formatting disk image"); + tracing::info!(runner_id=?self.runner_id, "formatting disk image"); // Format file as ext4 let cmd_out = Command::new("mkfs.ext4") .arg(&fs_img_path) @@ -62,7 +127,7 @@ impl Actor { std::str::from_utf8(&cmd_out.stderr)? 
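`setup::Comms` round-trips through SQLite as a bare integer: it is written with `as i32` in `start`, and, like `TransportProtocol` in `ports`, it can be recovered with the `from_repr` function generated by strum's FromRepr derive. A self-contained round-trip of that encoding:

use strum::FromRepr;

// Mirrors the setup::Comms enum above; from_repr takes usize by default
// when no #[repr] attribute is given.
#[derive(Debug, Clone, Copy, PartialEq, Eq, FromRepr)]
enum Comms {
    Basic = 0,
    Socket = 1,
}

fn main() {
    let stored = Comms::Socket as i32;
    assert_eq!(Comms::from_repr(stored as usize), Some(Comms::Socket));
}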
); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "mounting disk image"); + tracing::info!(runner_id=?self.runner_id, "mounting disk image"); // Mount fs img as loop mount let cmd_out = Command::new("mount") @@ -88,7 +153,7 @@ impl Actor { .await .context("failed to create actor fs work dir")?; - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "mounting overlay"); + tracing::info!(runner_id=?self.runner_id, "mounting overlay"); ensure!( fs::metadata(&image_path).await.is_ok(), @@ -104,7 +169,7 @@ impl Actor { .arg("-t") .arg("overlay") // Arbitrary device name - .arg(format!("{}-{}", self.actor_id, self.generation)) + .arg(self.runner_id.to_string()) .arg("-o") .arg(format!( "lowerdir={},upperdir={},workdir={}", @@ -128,7 +193,7 @@ impl Actor { .await .context("failed to create actor fs upper dir")?; - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "copying image contents to fs"); + tracing::info!(runner_id=?self.runner_id, "copying image contents to fs"); // Copy everything from the image (lowerdir) to the upperdir utils::copy_dir_all(image_path, &fs_upper_path) @@ -139,8 +204,7 @@ impl Actor { let duration = timer.elapsed().as_secs_f64(); crate::metrics::SETUP_MAKE_FS_DURATION.observe(duration); tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, duration_seconds=duration, "fs creation completed", ); @@ -167,16 +231,15 @@ impl Actor { ports: &protocol::HashableMap, ) -> Result<()> { let timer = Instant::now(); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "setting up oci bundle"); + tracing::info!(runner_id=?self.runner_id, "setting up oci bundle"); - let actor_path = ctx.actor_path(self.actor_id, self.generation); - let fs_path = actor_path.join("fs").join("upper"); + let runner_path = ctx.runner_path(self.runner_id); + let fs_path = runner_path.join("fs").join("upper"); let netns_path = self.netns_path(); // Read the config.json from the user-provided OCI bundle tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, "reading OCI bundle configuration", ); let oci_bundle_config_path = fs_path.join("config.json"); @@ -188,8 +251,7 @@ impl Actor { // Build env tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, "building environment variables", ); let env = user_config @@ -207,12 +269,11 @@ impl Actor { // // This config selectively uses parts from the user's OCI bundle in order to maintain security tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, "generating OCI configuration", ); let config = oci_config::config(oci_config::ConfigOpts { - actor_path: &actor_path, + runner_path: &runner_path, netns_path: &netns_path, args: user_config.process.args, env, @@ -246,21 +307,19 @@ impl Actor { // Write all files in parallel tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, "writing configuration files", ); tokio::try_join!( fs::write(oci_bundle_config_path, config_json), - fs::write(actor_path.join("resolv.conf"), resolv_conf), - fs::write(actor_path.join("hosts"), hosts_content) + fs::write(runner_path.join("resolv.conf"), resolv_conf), + fs::write(runner_path.join("hosts"), hosts_content) )?; let duration = timer.elapsed().as_secs_f64(); crate::metrics::SETUP_OCI_BUNDLE_DURATION.observe(duration); tracing::info!( - actor_id=?self.actor_id, - 
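The no-mounts fallback copies the extracted image into the upper dir via `utils::copy_dir_all`, which this patch does not show. One plausible shape for such a helper is a boxed-recursive async walk (a sketch under that assumption, not the actual implementation):

use std::path::PathBuf;

use anyhow::Result;
use futures_util::future::BoxFuture;
use tokio::fs;

// Recursively mirror the image tree (lowerdir) into the upper dir.
// Recursion in async fns needs boxing, hence BoxFuture.
fn copy_dir_all(src: PathBuf, dst: PathBuf) -> BoxFuture<'static, Result<()>> {
    Box::pin(async move {
        fs::create_dir_all(&dst).await?;

        let mut entries = fs::read_dir(&src).await?;
        while let Some(entry) = entries.next_entry().await? {
            let target = dst.join(entry.file_name());

            if entry.file_type().await?.is_dir() {
                copy_dir_all(entry.path(), target).await?;
            } else {
                fs::copy(entry.path(), target).await?;
            }
        }

        Ok(())
    })
}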
generation=?self.generation, + runner_id=?self.runner_id, duration_seconds=duration, "OCI bundle setup completed" ); @@ -268,66 +327,6 @@ impl Actor { Ok(()) } - pub async fn setup_isolate( - &self, - ctx: &Ctx, - ports: &protocol::HashableMap, - ) -> Result<()> { - let timer = Instant::now(); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "setting up isolate environment"); - - let actor_path = ctx.actor_path(self.actor_id, self.generation); - - tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, - "generating isolate configuration" - ); - let config = actor_config::Config { - resources: actor_config::Resources { - memory: self.config.resources.memory, - memory_max: self.config.resources.memory_max, - }, - ports: ports - .iter() - .map(|(name, port)| { - ( - name.clone(), - actor_config::Port { - target: port.target, - protocol: port.protocol, - }, - ) - }) - .collect(), - env: self.build_default_env(ctx, &ports), - metadata: self.config.metadata.clone(), - vector_socket_addr: ctx.config().vector.clone().map(|x| x.address), - }; - - tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, - "writing isolate configuration" - ); - fs::write( - actor_path.join("config.json"), - &serde_json::to_vec(&config)?, - ) - .await?; - - let duration = timer.elapsed().as_secs_f64(); - crate::metrics::SETUP_ISOLATE_DURATION.observe(duration); - tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, - duration_seconds=duration, - "isolate setup completed" - ); - - Ok(()) - } - // Only ran for bridge networking pub async fn setup_cni_network( &self, @@ -335,12 +334,12 @@ impl Actor { ports: &protocol::HashableMap, ) -> Result<()> { let timer = Instant::now(); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "setting up cni network"); + tracing::info!(runner_id=?self.runner_id, "setting up cni network"); - let actor_path = ctx.actor_path(self.actor_id, self.generation); + let runner_path = ctx.runner_path(self.runner_id); let netns_path = self.netns_path(); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "preparing cni port mappings"); + tracing::info!(runner_id=?self.runner_id, "preparing cni port mappings"); let cni_port_mappings = ports .iter() @@ -355,20 +354,20 @@ impl Actor { // MARK: Generate CNI parameters // - // See https://github.com/actornetworking/cni/blob/b62753aa2bfa365c1ceaff6f25774a8047c896b5/cnitool/cnitool.go#L31 + // See https://github.com/containernetworking/cni/blob/b62753aa2bfa365c1ceaff6f25774a8047c896b5/cnitool/cnitool.go#L31 // See Nomad capabilities equivalent: // https://github.com/hashicorp/nomad/blob/a8f0f2612ef9d283ed903721f8453a0c0c3f51c5/client/allocrunner/networking_cni.go#L105C46-L105C46 // // See supported args: - // https://github.com/actord/go-cni/blob/6603d5bd8941d7f2026bb5627f6aa4ff434f859a/namespace_opts.go#L22 - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "generating and writing cni parameters"); + // https://github.com/containerd/go-cni/blob/6603d5bd8941d7f2026bb5627f6aa4ff434f859a/namespace_opts.go#L22 + tracing::info!(runner_id=?self.runner_id, "generating and writing cni parameters"); let cni_params = json!({ "portMappings": cni_port_mappings, }); let cni_params_json = serde_json::to_string(&cni_params)?; fs::write( - actor_path.join("cni-cap-args.json"), + runner_path.join("cni-cap-args.json"), cni_params_json.as_bytes(), ) .await?; @@ -379,7 +378,7 @@ impl Actor { // 
https://github.com/hashicorp/nomad/blob/a8f0f2612ef9d283ed903721f8453a0c0c3f51c5/client/allocrunner/network_manager_linux.go#L119 // Name of the network in /opt/cni/config/$NETWORK_NAME.conflist - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "creating network namespace"); + tracing::info!(runner_id=?self.runner_id, "creating network namespace"); let cni_network_name = &ctx.config().cni.network_name(); let cmd_out = Command::new("ip") @@ -396,8 +395,7 @@ impl Actor { ); tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, cni_network_name=cni_network_name, "adding network to namespace", ); @@ -421,8 +419,7 @@ impl Actor { let duration = timer.elapsed().as_secs_f64(); crate::metrics::SETUP_CNI_NETWORK_DURATION.observe(duration); tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, duration_seconds=duration, "cni network setup completed" ); @@ -435,104 +432,48 @@ impl Actor { ctx: &Ctx, ) -> Result> { let timer = Instant::now(); - tracing::info!(actor_id=?self.actor_id, generation=?self.generation, "binding ports"); + tracing::info!(runner_id=?self.runner_id, "binding ports"); - let (mut gg_ports, mut host_ports): (Vec<_>, Vec<_>) = self + let (gg_ports, host_ports): (Vec<_>, Vec<_>) = self .config .ports .iter() .partition(|(_, port)| matches!(port.routing, protocol::PortRouting::GameGuard)); tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, gg_ports_count=gg_ports.len(), host_ports_count=host_ports.len(), "partitioned ports for binding" ); - // TODO: Could combine these into one query - let (mut gg_port_rows, mut host_port_rows) = tokio::try_join!( + // TODO: Could combine these into one + let (gg_ports, host_ports) = tokio::try_join!( bind_ports_inner( ctx, - self.actor_id, - self.generation, + self.runner_id, &gg_ports, ctx.config().network.lan_port_range_min() ..=ctx.config().network.lan_port_range_max() ), bind_ports_inner( ctx, - self.actor_id, - self.generation, + self.runner_id, &host_ports, ctx.config().network.wan_port_range_min() ..=ctx.config().network.wan_port_range_max() ), )?; - tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, - "sorting ports" - ); - - // The SQL query returns a list of TCP ports then UDP ports. We sort the input ports here to match - // that order. 
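The per-port body of `cni_port_mappings` above falls outside the diff context. By the convention of CNI's portMappings capability, entries carry hostPort, containerPort, and protocol, so the generated parameters plausibly look like the sketch below (field names assumed from the CNI spec, not taken from this patch; `source` is the bound host port, `target` the port inside the netns):

use serde_json::{json, Value};

fn cni_params(ports: &[(u16, u16, &str)]) -> Value {
    let mappings: Vec<Value> = ports
        .iter()
        .map(|(source, target, protocol)| {
            json!({
                "hostPort": source,
                "containerPort": target,
                "protocol": protocol,
            })
        })
        .collect();

    // This is the JSON written to cni-cap-args.json and fed to cnitool
    // via CAP_ARGS.
    json!({ "portMappings": mappings })
}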
- gg_ports.sort_by_key(|(_, port)| port.protocol); - host_ports.sort_by_key(|(_, port)| port.protocol); - // We sort the SQL results here also, just in case - gg_port_rows.sort_by_key(|(_, protocol)| *protocol); - host_port_rows.sort_by_key(|(_, protocol)| *protocol); - - tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, - "mapping proxied ports" - ); - - let proxied_ports = - gg_ports - .iter() - .zip(gg_port_rows) - .map(|((label, port), (host_port, _))| { - let host_port = host_port as u16; - - ( - (*label).clone(), - protocol::ProxiedPort { - source: host_port, - // When no target port was selected, default to randomly selected host port - target: port.target.unwrap_or(host_port), - lan_hostname: ctx.config().network.lan_hostname.clone(), - protocol: port.protocol, - }, - ) - }) - // Chain host ports - .chain(host_ports.iter().zip(host_port_rows).map( - |((label, port), (host_port, _))| { - let host_port = host_port as u16; - - ( - (*label).clone(), - protocol::ProxiedPort { - source: host_port, - // When no target port was selected, default to randomly selected host port - target: port.target.unwrap_or(host_port), - lan_hostname: ctx.config().network.lan_hostname.clone(), - protocol: port.protocol, - }, - ) - }, - )) - .collect::>(); + let proxied_ports = gg_ports + .into_iter() + .chain(host_ports.into_iter()) + .collect::>(); let duration = timer.elapsed().as_secs_f64(); crate::metrics::SETUP_BIND_PORTS_DURATION.observe(duration); tracing::info!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, duration_seconds=duration, ports_count=proxied_ports.len(), "ports binding completed" @@ -541,11 +482,115 @@ impl Actor { Ok(proxied_ports) } + pub async fn spawn_orphaned(&self, ctx: &Ctx, env: &[(&str, String)]) -> Result<()> { + { + ensure!( + self.pid + .try_read() + .context("pid should not be getting written to anywhere else")? + .is_none(), + "runner already has pid" + ); + } + + // Prepare the arguments for the runner + let runner_path = ctx.runner_path(self.runner_id); + let runner_args = vec![runner_path.to_str().context("bad path")?, self.container_id()]; + + // NOTE: Pipes are automatically closed on drop (OwnedFd) + // Pipe communication between processes + let (pipe_read, pipe_write) = pipe()?; + + // NOTE: This is why we fork the process twice: https://stackoverflow.com/a/5386753 + match unsafe { fork() }.context("process first fork failed")? { + ForkResult::Parent { child } => { + // Close the writing end of the pipe in the parent + nix::unistd::close(pipe_write)?; + + // Ensure that the child process spawned successfully + match waitpid(child, None).context("waitpid failed")? { + WaitStatus::Exited(_, 0) => { + // Read the second child's PID from the pipe + let mut buf = [0u8; 4]; + read(pipe_read, &mut buf)?; + let orphan_pid = Pid::from_raw(i32::from_le_bytes(buf)); + + *self.pid.write().await = Some(orphan_pid); + self.bump(); + + tracing::info!(runner_id=?self.runner_id, pid=?orphan_pid, "runner spawned"); + + // Update DB + utils::sql::query(|| async { + sqlx::query(indoc!( + " + UPDATE runners + SET + running_ts = ?2, + pid = ?3 + WHERE + runner_id = ?1 + ", + )) + .bind(self.runner_id) + .bind(utils::now()) + .bind(orphan_pid.as_raw()) + .execute(&mut *ctx.sql().await?) 
+ .await + }) + .await?; + + Ok(()) + } + WaitStatus::Exited(_, status) => { + bail!("Child process exited with status {}", status) + } + _ => bail!("Unexpected wait status for child process"), + } + } + ForkResult::Child => { + // Child process + match unsafe { fork() } { + Result::Ok(ForkResult::Parent { child }) => { + // Write the second child's PID to the pipe + let orphan_pid_bytes = child.as_raw().to_le_bytes(); + write(pipe_write, &orphan_pid_bytes)?; + + // Exit the intermediate child + std::process::exit(0); + } + Result::Ok(ForkResult::Child) => { + // Disassociate from the parent by creating a new session + setsid().context("setsid failed")?; + + // Exit immediately on fail in order to not leak process + let err = std::process::Command::new( + &ctx.config().runner.container_runner_binary_path(), + ) + .args(&runner_args) + .envs(env.iter().cloned()) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .exec(); + eprintln!("exec failed: {err:?}"); + std::process::exit(1); + } + Err(err) => { + // Exit immediately in order to not leak child process + eprintln!("process second fork failed: {err:?}"); + std::process::exit(1); + } + } + } + } + } + // This function is meant to run gracefully-handled fallible steps to clean up every part of the setup // process #[tracing::instrument(skip_all)] pub async fn cleanup_setup(&self, ctx: &Ctx) { - let actor_path = ctx.actor_path(self.actor_id, self.generation); + let runner_path = ctx.runner_path(self.runner_id); let netns_path = self.netns_path(); // Clean up fs mounts @@ -579,15 +624,14 @@ impl Actor { match Command::new("umount") .arg("-dl") - .arg(actor_path.join("fs")) + .arg(runner_path.join("fs")) .output() .await { Result::Ok(cmd_out) => { if !cmd_out.status.success() { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, stdout=%std::str::from_utf8(&cmd_out.stdout).unwrap_or(""), stderr=%std::str::from_utf8(&cmd_out.stderr).unwrap_or(""), "failed `umount` command", @@ -596,8 +640,7 @@ impl Actor { } Err(err) => { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, ?err, "failed to run `umount` command", ); @@ -611,18 +654,14 @@ impl Actor { match Command::new("runc") .arg("delete") .arg("--force") - .arg(pegboard_config::utils::format_container_id( - &self.actor_id.to_string(), - self.generation, - )) + .arg(self.container_id()) .output() .await { Result::Ok(cmd_out) => { if !cmd_out.status.success() { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, stdout=%std::str::from_utf8(&cmd_out.stdout).unwrap_or(""), stderr=%std::str::from_utf8(&cmd_out.stderr).unwrap_or(""), "failed `runc` delete command", @@ -631,8 +670,7 @@ impl Actor { } Err(err) => { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, ?err, "failed to run `runc` command", ); @@ -640,7 +678,7 @@ impl Actor { } if let protocol::NetworkMode::Bridge = self.config.network_mode { - match fs::read_to_string(actor_path.join("cni-cap-args.json")).await { + match fs::read_to_string(runner_path.join("cni-cap-args.json")).await { Result::Ok(cni_params_json) => { match Command::new("cnitool") .arg("del") @@ -656,8 +694,7 @@ impl Actor { Result::Ok(cmd_out) => { if !cmd_out.status.success() { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, 
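Stripping the pipe and the exec out of `spawn_orphaned` leaves the core of the double-fork trick the linked answer describes: the intermediate child exits immediately after forking again, so the grandchild is reparented to init and outlives the manager, and `setsid` detaches it from the manager's session. A minimal sketch:

use anyhow::Result;
use nix::sys::wait::waitpid;
use nix::unistd::{fork, setsid, ForkResult};

fn daemonize() -> Result<()> {
    match unsafe { fork() }? {
        ForkResult::Parent { child } => {
            // Reap the intermediate child; the grandchild no longer
            // belongs to this process tree.
            waitpid(child, None)?;
            Ok(())
        }
        ForkResult::Child => match unsafe { fork() } {
            Ok(ForkResult::Parent { .. }) => std::process::exit(0),
            Ok(ForkResult::Child) => {
                setsid().expect("setsid failed");
                // exec() the long-lived container-runner process here
                std::process::exit(0);
            }
            Err(_) => std::process::exit(1),
        },
    }
}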
stdout=%std::str::from_utf8(&cmd_out.stdout).unwrap_or(""), stderr=%std::str::from_utf8(&cmd_out.stderr).unwrap_or(""), "failed `cnitool del` command", @@ -666,8 +703,7 @@ impl Actor { } Err(err) => { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, ?err, "failed to run `cnitool` command", ); @@ -676,8 +712,7 @@ impl Actor { } Err(err) => { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, ?err, "failed to read `cni-cap-args.json`", ); @@ -696,8 +731,7 @@ impl Actor { Result::Ok(cmd_out) => { if !cmd_out.status.success() { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, stdout=%std::str::from_utf8(&cmd_out.stdout).unwrap_or(""), stderr=%std::str::from_utf8(&cmd_out.stderr).unwrap_or(""), "failed `ip netns` command", @@ -706,8 +740,7 @@ impl Actor { } Err(err) => { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, ?err, "failed to run `ip` command", ); @@ -715,8 +748,7 @@ impl Actor { } } else { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, ?netns_path, "invalid netns path", ); @@ -729,14 +761,13 @@ impl Actor { // Allow time for vector to pick up logs before they are deleted tokio::time::sleep(std::time::Duration::from_secs(5)).await; - // Delete entire actor dir. Note that for actors using KV storage, it is persisted elsewhere and will + // Delete entire runner dir. Note that for actors using KV storage, it is persisted elsewhere and will // not be deleted by this (see `persist_storage` in the runner protocol). - if let Err(err) = tokio::fs::remove_dir_all(&actor_path).await { + if let Err(err) = tokio::fs::remove_dir_all(&runner_path).await { tracing::error!( - actor_id=?self.actor_id, - generation=?self.generation, + runner_id=?self.runner_id, ?err, - "failed to delete actor dir", + "failed to delete runner dir", ); } } @@ -748,7 +779,7 @@ impl Actor { Path::new("/proc/1/ns/net").to_path_buf() } else { // CNI network that will be created - Path::new("/var/run/netns").join(format!("{}-{}", self.actor_id, self.generation)) + Path::new("/var/run/netns").join(self.runner_id.to_string()) } } @@ -768,10 +799,20 @@ impl Actor { port.target.to_string(), ) })) - .chain([( - "RIVET_API_ENDPOINT".to_string(), - ctx.config().cluster.api_endpoint.to_string(), - )]) + .chain([ + ( + "RIVET_MANAGER_IP".to_string(), + ctx.config().runner.ip().to_string(), + ), + ( + "RIVET_MANAGER_PORT".to_string(), + ctx.config().runner.port().to_string(), + ), + ( + "RIVET_API_ENDPOINT".to_string(), + ctx.config().cluster.api_endpoint.to_string(), + ), + ]) .collect() } } @@ -794,108 +835,92 @@ fn build_hosts_content(ctx: &Ctx) -> String { async fn bind_ports_inner( ctx: &Ctx, - actor_id: Uuid, - generation: u32, + runner_id: Uuid, ports: &[(&String, &protocol::Port)], range: std::ops::RangeInclusive, -) -> Result> { +) -> Result, Error> { if ports.is_empty() { return Ok(Vec::new()); } - let mut tcp_count = 0; - let mut udp_count = 0; + // Compute the “modulus” for wrapping + let truncated_max = (range.end() - range.start()) as i64; - // Count ports - for (_, port) in ports { - match port.protocol { - protocol::TransportProtocol::Tcp => tcp_count += 1, - protocol::TransportProtocol::Udp => udp_count += 1, - } - } + // Pick one random start‐offset for each protocol + let mut tcp_cur = rand::thread_rng().gen_range(0..=truncated_max); + let mut 
udp_cur = rand::thread_rng().gen_range(0..=truncated_max); - let truncated_max = range.end() - range.start(); - // Add random spread to port selection - let tcp_offset = rand::thread_rng().gen_range(0..truncated_max); - let udp_offset = rand::thread_rng().gen_range(0..truncated_max); + let mut conn = ctx.sql().await?; + let mut tx = conn.begin().await?; + let mut bound = Vec::with_capacity(ports.len()); - // Selects available TCP and UDP ports - let rows = utils::sql::query(|| async { - sqlx::query_as::<_, (i64, i64)>(indoc!( + for (label, port) in ports { + let cur_offset = match port.protocol { + protocol::TransportProtocol::Tcp => &mut tcp_cur, + protocol::TransportProtocol::Udp => &mut udp_cur, + }; + + let row = sqlx::query_as::<_, (i64,)>(indoc!( " - INSERT INTO actor_ports (actor_id, generation, port, protocol) - -- Select TCP ports - SELECT ?1, ?2, port, protocol + INSERT INTO runner_ports (runner_id, label, source, target, protocol) + SELECT ?1, ?2, port, ?3, ?4 FROM ( WITH RECURSIVE nums(n, i) AS ( SELECT ?5, ?5 UNION ALL - SELECT (n + 1) % (?8 + 1), i + 1 - FROM nums - WHERE i < ?8 + ?5 - ), - available_ports(port) AS ( - SELECT nums.n + ?7 - FROM nums - LEFT JOIN actor_ports AS p - ON - nums.n + ?7 = p.port AND - p.protocol = 0 AND - delete_ts IS NULL - WHERE - p.port IS NULL OR - delete_ts IS NOT NULL - LIMIT ?3 - ) - SELECT port, 0 AS protocol FROM available_ports - ) - UNION ALL - -- Select UDP ports - SELECT ?1, ?2, port, protocol - FROM ( - WITH RECURSIVE - nums(n, i) AS ( - SELECT ?6, ?6 - UNION ALL - SELECT (n + 1) % (?8 + 1), i + 1 + SELECT (n + 1) % (?6 + 1), i + 1 FROM nums - WHERE i < ?8 + ?6 + WHERE i < ?6 + ?7 ), available_ports(port) AS ( SELECT nums.n + ?7 FROM nums - LEFT JOIN actor_ports AS p + LEFT JOIN runner_ports AS p ON - nums.n + ?7 = p.port AND - p.protocol = 1 AND - delete_ts IS NULL - WHERE - p.port IS NULL OR - delete_ts IS NOT NULL - LIMIT ?4 + (nums.n + ?7) = p.source AND + p.protocol = ?4 AND + p.delete_ts IS NULL + WHERE p.source IS NULL OR p.delete_ts IS NOT NULL + LIMIT 1 ) - SELECT port, 1 AS protocol FROM available_ports + SELECT port FROM available_ports ) - RETURNING port, protocol - ", + RETURNING source + " )) - .bind(actor_id) - .bind(generation as i64) - .bind(tcp_count as i64) // ?3 - .bind(udp_count as i64) // ?4 - .bind(tcp_offset as i64) // ?5 - .bind(udp_offset as i64) // ?6 - .bind(*range.start() as i64) // ?7 - .bind(truncated_max as i64) // ?8 - .fetch_all(&mut *ctx.sql().await?) 
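In plain Rust, the search each recursive CTE performs is: probe ports starting at the protocol's current offset, wrap around modulo the range size, and take the first source port with no live binding; bumping `cur_offset` afterwards simply starts the next same-protocol allocation one slot later. An equivalent sketch:

use std::collections::HashSet;

// `taken` stands in for live runner_ports rows (delete_ts IS NULL) of
// one protocol; the CTE walks the same wrapped sequence in SQL.
fn find_free_port(
    range_min: u16,
    range_max: u16,
    start_offset: u16,
    taken: &HashSet<u16>,
) -> Option<u16> {
    let size = u32::from(range_max - range_min) + 1;

    (0..size).find_map(|i| {
        let n = (u32::from(start_offset) + i) % size;
        let port = range_min + n as u16;
        (!taken.contains(&port)).then_some(port)
    })
}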
- .await - }) - .await?; - - if rows.len() != tcp_count + udp_count { - bail!("not enough available ports"); + .bind(runner_id) // ?1 + .bind(label) // ?2 + .bind(port.target) // ?3 + .bind(port.protocol as i64) // ?4 + .bind(*cur_offset) // ?5: starting n for this protocol + .bind(truncated_max) // ?6: modulus (range size) + .bind(*range.start() as i64) // ?7: minimum port value + .fetch_optional(&mut *tx) + .await?; + + let Some((source,)) = row else { + bail!("not enough available ports"); + }; + + let host_port = source.try_into()?; + + bound.push(( + label.to_string(), + protocol::ProxiedPort { + source: host_port, + // When no target port was selected, default to randomly selected host port + target: port.target.unwrap_or(host_port), + lan_hostname: ctx.config().network.lan_hostname.clone(), + protocol: port.protocol, + }, + )); + + // bump the offset so next same‐protocol allocation starts from the next number + *cur_offset = (*cur_offset + 1) % (truncated_max + 1); } - Ok(rows) + tx.commit().await?; + + Ok(bound) } diff --git a/packages/edge/infra/client/manager/src/utils/mod.rs b/packages/edge/infra/client/manager/src/utils/mod.rs index 845f9eebe0..35456c91da 100644 --- a/packages/edge/infra/client/manager/src/utils/mod.rs +++ b/packages/edge/infra/client/manager/src/utils/mod.rs @@ -10,7 +10,6 @@ use notify::{ event::{AccessKind, AccessMode}, Event, EventKind, RecommendedWatcher, RecursiveMode, Watcher, }; -use pegboard::protocol; use pegboard_config::Config; use sql::SqlitePoolExt; use sqlx::{ @@ -36,10 +35,9 @@ pub async fn init_dir(config: &Config) -> Result<()> { bail!("data dir `{}` does not exist", data_dir.display()); } - if config.client.runner.flavor == protocol::ClientFlavor::Container - && fs::metadata(&config.client.runner.container_runner_binary_path()) - .await - .is_err() + if fs::metadata(&config.client.runner.container_runner_binary_path()) + .await + .is_err() { bail!( "container runner binary `{}` does not exist", @@ -51,21 +49,10 @@ pub async fn init_dir(config: &Config) -> Result<()> { ); } - if config.client.runner.flavor == protocol::ClientFlavor::Isolate - && fs::metadata(&config.client.runner.isolate_runner_binary_path()) - .await - .is_err() - { - bail!( - "isolate runner binary `{}` does not exist", - config.client.runner.isolate_runner_binary_path().display() - ); - } - - // Create actors dir - match fs::create_dir(data_dir.join("actors")).await { + // Create runners dir + match fs::create_dir(data_dir.join("runners")).await { Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {} - x => x.context("failed to create /actors dir in data dir")?, + x => x.context("failed to create /runners dir in data dir")?, } // Create images dir @@ -74,12 +61,6 @@ pub async fn init_dir(config: &Config) -> Result<()> { x => x.context("failed to create /images dir in data dir")?, } - // Create runner dir - match fs::create_dir(data_dir.join("runner")).await { - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {} - x => x.context("failed to create /runner dir in data dir")?, - } - // Create db dir match fs::create_dir(data_dir.join("db")).await { Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {} @@ -145,8 +126,6 @@ async fn init_sqlite_schema(pool: &SqlitePool) -> Result<()> { last_command_idx INTEGER NOT NULL, last_workflow_id BLOB, -- UUID - isolate_runner_pid INTEGER, - -- Keeps this table having one row _persistence INTEGER UNIQUE NOT NULL DEFAULT TRUE -- BOOLEAN ) STRICT @@ -189,6 +168,27 @@ async fn init_sqlite_schema(pool: &SqlitePool) 
-> Result<()> { .execute(&mut *conn) .await?; + sqlx::query(indoc!( + " + CREATE TABLE IF NOT EXISTS runners ( + runner_id BLOB NOT NULL, -- UUID + comms INTEGER NOT NULL, -- runner::setup::Comms + config BLOB NOT NULL, + + start_ts INTEGER NOT NULL, + running_ts INTEGER, + exit_ts INTEGER, + + pid INTEGER, + exit_code INTEGER, + + PRIMARY KEY (runner_id) + ) STRICT + ", + )) + .execute(&mut *conn) + .await?; + sqlx::query(indoc!( " CREATE TABLE IF NOT EXISTS images_cache ( @@ -217,7 +217,6 @@ async fn init_sqlite_schema(pool: &SqlitePool) -> Result<()> { stop_ts INTEGER, exit_ts INTEGER, - pid INTEGER, exit_code INTEGER, -- Also exists in the config column but this is for indexing @@ -242,10 +241,11 @@ async fn init_sqlite_schema(pool: &SqlitePool) -> Result<()> { sqlx::query(indoc!( " - CREATE TABLE IF NOT EXISTS actor_ports ( - actor_id BLOB NOT NULL, -- UUID - generation INT NOT NULL, - port INT NOT NULL, + CREATE TABLE IF NOT EXISTS runner_ports ( + runner_id BLOB NOT NULL, -- UUID + label TEXT NOT NULL, + source INT NOT NULL, + target INT, protocol INT NOT NULL, -- protocol::TransportProtocol delete_ts INT @@ -257,8 +257,8 @@ async fn init_sqlite_schema(pool: &SqlitePool) -> Result<()> { sqlx::query(indoc!( " - CREATE INDEX IF NOT EXISTS actor_ports_id_idx - ON actor_ports(actor_id, generation) + CREATE INDEX IF NOT EXISTS runner_ports_id_idx + ON runner_ports(runner_id) ", )) .execute(&mut *conn) @@ -266,8 +266,8 @@ async fn init_sqlite_schema(pool: &SqlitePool) -> Result<()> { sqlx::query(indoc!( " - CREATE UNIQUE INDEX IF NOT EXISTS actor_ports_unique_idx - ON actor_ports(port, protocol) + CREATE UNIQUE INDEX IF NOT EXISTS runner_ports_source_unique_idx + ON runner_ports(source, protocol) WHERE delete_ts IS NULL ", )) diff --git a/packages/edge/infra/client/manager/tests/common.rs b/packages/edge/infra/client/manager/tests/common.rs index 28dd45aed3..572bbec314 100644 --- a/packages/edge/infra/client/manager/tests/common.rs +++ b/packages/edge/infra/client/manager/tests/common.rs @@ -73,34 +73,43 @@ pub async fn start_echo_actor( actor_id, generation: 0, config: Box::new(protocol::ActorConfig { - image: protocol::Image { - id: Uuid::nil(), - artifact_url_stub: "/image".into(), - fallback_artifact_url: None, - kind: protocol::ImageKind::DockerImage, - compression: protocol::ImageCompression::None, - }, - root_user_enabled: false, - env: [("foo".to_string(), "bar".to_string())] + runner: Some(protocol::ActorRunner::New { + runner_id: Uuid::nil(), + config: protocol::RunnerConfig { + image: protocol::Image { + id: Uuid::nil(), + artifact_url_stub: "/image".into(), + fallback_artifact_url: None, + kind: protocol::ImageKind::DockerImage, + compression: protocol::ImageCompression::None, + allocation_type: protocol::ImageAllocationType::Multi, + }, + root_user_enabled: false, + resources: protocol::Resources { + cpu: 100, + memory: 10 * 1024 * 1024, + memory_max: 15 * 1024 * 1024, + disk: 15, + }, + env: [("BAR".to_string(), "foo".to_string())] + .into_iter() + .collect(), + ports: [( + "main".to_string(), + protocol::Port { + target: None, + protocol: protocol::TransportProtocol::Tcp, + routing: protocol::PortRouting::Host, + }, + )] + .into_iter() + .collect(), + network_mode: protocol::NetworkMode::Host, + }, + }), + env: [("FOO".to_string(), "bar".to_string())] .into_iter() .collect(), - ports: [( - "main".to_string(), - protocol::Port { - target: None, - protocol: protocol::TransportProtocol::Tcp, - routing: protocol::PortRouting::Host, - }, - )] - .into_iter() - .collect(), - 
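The rebuilt unique index is partial, which is what makes soft-deletion work: uniqueness of (source, protocol) only applies while delete_ts IS NULL, so a port becomes allocatable again the moment its old binding is soft-deleted. An illustrative sequence against the schema above (demo function, not part of the patch):

use anyhow::Result;
use sqlx::SqliteConnection;

async fn port_reuse_demo(conn: &mut SqliteConnection) -> Result<()> {
    let bind = "INSERT INTO runner_ports (runner_id, label, source, target, protocol)
                VALUES (x'00', 'main', 20000, NULL, 0)";

    // First bind succeeds; a second bind of the same (source, protocol)
    // is rejected while the row is live.
    sqlx::query(bind).execute(&mut *conn).await?;
    assert!(sqlx::query(bind).execute(&mut *conn).await.is_err());

    // Soft-deleting frees the pair for reuse.
    sqlx::query("UPDATE runner_ports SET delete_ts = 1 WHERE source = 20000")
        .execute(&mut *conn)
        .await?;
    sqlx::query(bind).execute(&mut *conn).await?;

    Ok(())
}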
network_mode: protocol::NetworkMode::Host, - resources: protocol::Resources { - cpu: 100, - memory: 10 * 1024 * 1024, - memory_max: 15 * 1024 * 1024, - disk: 15, - }, metadata: protocol::Raw::new(&protocol::ActorMetadata { actor: protocol::ActorMetadataActor { actor_id, @@ -130,29 +139,26 @@ pub async fn start_echo_actor( network: None, }) .unwrap(), - }), - }; - send_command(tx, cmd).await; -} - -pub async fn start_js_echo_actor( - tx: &mut SplitSink, Message>, - actor_id: Uuid, -) { - let cmd = protocol::Command::StartActor { - actor_id, - generation: 0, - config: Box::new(protocol::ActorConfig { + // Deprecated image: protocol::Image { id: Uuid::nil(), - artifact_url_stub: "/js-image".into(), + artifact_url_stub: "/image".into(), fallback_artifact_url: None, - kind: protocol::ImageKind::JavaScript, + kind: protocol::ImageKind::DockerImage, compression: protocol::ImageCompression::None, + allocation_type: protocol::ImageAllocationType::Multi, }, root_user_enabled: false, - env: Default::default(), + env: [("BAR".to_string(), "foo".to_string())] + .into_iter() + .collect(), + resources: protocol::Resources { + cpu: 100, + memory: 10 * 1024 * 1024, + memory_max: 15 * 1024 * 1024, + disk: 15, + }, ports: [( "main".to_string(), protocol::Port { @@ -164,41 +170,6 @@ pub async fn start_js_echo_actor( .into_iter() .collect(), network_mode: protocol::NetworkMode::Host, - resources: protocol::Resources { - cpu: 100, - memory: 10 * 1024 * 1024, - memory_max: 15 * 1024 * 1024, - disk: 15, - }, - metadata: protocol::Raw::new(&protocol::ActorMetadata { - actor: protocol::ActorMetadataActor { - actor_id, - tags: [("foo".to_string(), "bar".to_string())] - .into_iter() - .collect(), - create_ts: 0, - }, - project: protocol::ActorMetadataProject { - project_id: Uuid::nil(), - slug: "foo".to_string(), - }, - environment: protocol::ActorMetadataEnvironment { - env_id: Uuid::nil(), - slug: "foo".to_string(), - }, - datacenter: protocol::ActorMetadataDatacenter { - name_id: "local".to_string(), - display_name: "Local".to_string(), - }, - cluster: protocol::ActorMetadataCluster { - cluster_id: Uuid::nil(), - }, - build: protocol::ActorMetadataBuild { - build_id: Uuid::nil(), - }, - network: None, - }) - .unwrap(), }), }; @@ -232,7 +203,6 @@ pub fn start_server( pub async fn init_client(gen_path: &Path, working_path: &Path) -> Config { let container_runner_binary_path = working_path.join("bin").join("container-runner"); - let isolate_runner_binary_path = working_path.join("bin").join("isolate-runner"); tokio::fs::create_dir(working_path.join("bin")) .await @@ -245,12 +215,6 @@ pub async fn init_client(gen_path: &Path, working_path: &Path) -> Config { ) .await .unwrap(); - tokio::fs::copy( - isolate_v8_runner_path(gen_path), - &isolate_runner_binary_path, - ) - .await - .unwrap(); let config = Config { client: Client { @@ -262,12 +226,10 @@ pub async fn init_client(gen_path: &Path, working_path: &Path) -> Config { api_endpoint: Url::parse("http://127.0.0.1").unwrap(), }, runner: Runner { - // Not necessary for the test - flavor: protocol::ClientFlavor::Container, + ip: None, port: None, use_mounts: Some(true), container_runner_binary_path: Some(container_runner_binary_path), - isolate_runner_binary_path: Some(isolate_runner_binary_path), }, images: Images { max_cache_size: None, @@ -386,21 +348,7 @@ pub async fn build_binaries(gen_path: &Path) { assert!(status.success()); - // Js image - let status = Command::new("tar") - .arg("-cf") - .arg(js_image_path(gen_path)) - .arg("-C") - 
.arg(Path::new(env!("CARGO_MANIFEST_DIR")).join("tests")) - .arg("index.js") - .status() - .await - .unwrap(); - - assert!(status.success()); - build_runner(gen_path, "container").await; - build_runner(gen_path, "isolate-v8").await; } async fn build_runner(gen_path: &Path, variant: &str) { @@ -449,11 +397,7 @@ async fn build_runner(gen_path: &Path, variant: &str) { let status = Command::new("docker") .arg("cp") .arg(format!("{}:{}", container_name, binary_path_in_container)) - .arg(if variant == "container" { - container_runner_path(gen_path) - } else { - isolate_v8_runner_path(gen_path) - }) + .arg(container_runner_path(gen_path)) .status() .await .expect("Failed to copy binary from container"); @@ -637,10 +581,6 @@ pub fn container_runner_path(gen_path: &Path) -> PathBuf { gen_path.join("pegboard-container-runner").to_path_buf() } -pub fn isolate_v8_runner_path(gen_path: &Path) -> PathBuf { - gen_path.join("pegboard-isolate-v8-runner").to_path_buf() -} - pub fn image_path(gen_path: &Path) -> PathBuf { gen_path.join("pegboard-echo-server").to_path_buf() } diff --git a/packages/edge/infra/client/manager/tests/index.js b/packages/edge/infra/client/manager/tests/index.js deleted file mode 100644 index 8b97c5c57d..0000000000 --- a/packages/edge/infra/client/manager/tests/index.js +++ /dev/null @@ -1,23 +0,0 @@ -export default { - async start(ctx) { - console.log(Deno.env.toObject()); - - console.log(ctx.metadata); - - const server = Deno.serve({ - handler, - port: Number.parseInt(Deno.env.get("PORT_MAIN")), - }); - - await server.finished; - }, -}; - -function handler(req) { - console.log("req"); - - return new Response(req.body, { - status: 200, - headers: { "Content-Type": "application/json" }, - }); -} diff --git a/packages/edge/infra/client/manager/tests/isolate_lifecycle.rs b/packages/edge/infra/client/manager/tests/isolate_lifecycle.rs deleted file mode 100644 index 6722765d86..0000000000 --- a/packages/edge/infra/client/manager/tests/isolate_lifecycle.rs +++ /dev/null @@ -1,216 +0,0 @@ -// NOTE: Requires installing skopeo and umoci on the machine running this test - -use std::{sync::Arc, time::Duration}; - -use futures_util::StreamExt; -use nix::sys::signal::Signal; -use pegboard::protocol; -use pegboard_manager::Ctx; -use tokio::{net::TcpStream, sync::Mutex}; -use tokio_tungstenite::tungstenite::protocol::Message; -use uuid::Uuid; - -mod common; -use common::*; - -#[derive(Debug)] -enum State { - None, - Starting, - Running, - Stopped, - Exited, -} - -/// Goes through all actor states and commands. 
-#[tokio::test(flavor = "multi_thread")] -async fn isolate_lifecycle() { - setup_tracing(); - - tracing::info!("starting test"); - - let (_gen_tmp_dir, gen_tmp_dir_path) = setup_dependencies().await; - - let ctx_wrapper: Arc>>> = Arc::new(Mutex::new(None)); - let (close_tx, close_rx) = tokio::sync::watch::channel(()); - let close_tx = Arc::new(close_tx); - - let port = portpicker::pick_unused_port().expect("no free ports"); - start_server(ctx_wrapper.clone(), close_tx, port, handle_connection); - - // Init project directories - let tmp_dir = tempfile::TempDir::new().unwrap(); - let path = tmp_dir.path(); - // let path = std::path::Path::new( - // "/home/rivet/rivet-ee/oss/packages/edge/infra/client/manager/tests/foo", - // ); - - let config = init_client(&gen_tmp_dir_path, &path).await; - tracing::info!(path=%path.display(), "client dir"); - - start_client(config, ctx_wrapper, close_rx.clone(), port).await; -} - -async fn handle_connection( - ctx_wrapper: Arc>>>, - close_tx: Arc>, - raw_stream: TcpStream, -) { - tokio::spawn(async move { - let ws_stream = tokio_tungstenite::accept_async(raw_stream).await.unwrap(); - let (mut tx, mut rx) = ws_stream.split(); - - tokio::time::sleep(std::time::Duration::from_millis(16)).await; - - // Read ctx from wrapper - let ctx = { - let guard = ctx_wrapper.lock().await; - guard.clone().unwrap() - }; - - let actor_id = Uuid::new_v4(); - let mut actor_state = State::None; - - // Receive messages from socket - while let Some(msg) = rx.next().await { - match msg.unwrap() { - Message::Binary(buf) => { - let protocol_version = 1; - let packet = protocol::ToServer::deserialize(protocol_version, &buf).unwrap(); - - match packet { - protocol::ToServer::Init { .. } => { - send_init_packet(&mut tx).await; - - start_js_echo_actor(&mut tx, actor_id).await; - } - protocol::ToServer::Events(events) => { - for event in events { - tracing::info!(?event, "received event"); - - let protocol::Event::ActorStateUpdate { state, .. } = - event.inner.deserialize().unwrap(); - - match state { - protocol::ActorState::Starting => { - if let State::None = actor_state { - actor_state = State::Starting; - } else { - panic!( - "invalid prior state: {actor_state:?} -> {state:?}" - ); - } - - // Verify client state - let actors = ctx.actors().read().await; - assert!( - actors.contains_key(&(actor_id, 0)), - "actor not in client memory" - ); - } - protocol::ActorState::Running { ref ports, .. 
} => { - let port = ports.values().next().unwrap().source; - - if let State::Starting = actor_state { - actor_state = State::Running; - } else { - panic!( - "invalid prior state: {actor_state:?} -> {state:?}" - ); - } - - // Verify client state - let actors = ctx.actors().read().await; - assert!( - actors.contains_key(&(actor_id, 0)), - "actor not in client memory" - ); - - tokio::time::sleep(std::time::Duration::from_millis(1000)) - .await; - - tracing::info!("sending echo"); - - // Send echo test - let req = b"hello world"; - let res = reqwest::Client::new() - .post(format!("http://0.0.0.0:{port}")) - .body(req.to_vec()) - .send() - .await - .unwrap() - .error_for_status() - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(req, &res[..], "echo failed"); - - tracing::info!("echo success"); - - // Stop actor - send_command( - &mut tx, - protocol::Command::SignalActor { - actor_id, - generation: 0, - signal: Signal::SIGKILL as i32, - persist_storage: false, - }, - ) - .await; - } - protocol::ActorState::Stopped => { - if let State::Running = actor_state { - actor_state = State::Stopped; - } else { - panic!( - "invalid prior state: {actor_state:?} -> {state:?}" - ); - } - - // Verify client state - let actors = ctx.actors().read().await; - assert!( - actors.contains_key(&(actor_id, 0)), - "actor not in client memory" - ); - } - protocol::ActorState::Exited { .. } => { - if let State::Stopped = actor_state { - actor_state = State::Exited; - } else { - panic!( - "invalid prior state: {actor_state:?} -> {state:?}" - ); - } - - tokio::time::sleep(Duration::from_millis(50)).await; - - // Verify client state - let actors = ctx.actors().read().await; - assert!( - !actors.contains_key(&(actor_id, 0)), - "actor still in client memory" - ); - - // Test complete - close_tx.send(()).unwrap(); - } - state => panic!("unexpected state received: {state:?}"), - } - } - } - protocol::ToServer::AckCommands { .. 
} => {} - } - } - Message::Close(_) => { - panic!("socket closed"); - } - _ => {} - } - } - - tracing::info!("client disconnected"); - }); -} diff --git a/packages/edge/infra/client/manager/tests/vector.json b/packages/edge/infra/client/manager/tests/vector.json index 8c2cdc760a..730cbc0cf8 100644 --- a/packages/edge/infra/client/manager/tests/vector.json +++ b/packages/edge/infra/client/manager/tests/vector.json @@ -18,7 +18,7 @@ } }, "transforms": { - "actors": { + "runners": { "type": "filter", "inputs": [ "vector", @@ -26,19 +26,19 @@ ], "condition": { "type": "vrl", - "source": ".source == \"actors\"" + "source": ".source == \"runners\"" } }, "add_prefix": { "type": "remap", "inputs": [ - "actors" + "runners" ], - "source": ".message, err = \"\u001b[2m\" + \"actor_id=\" + .actor_id + \"\u001b[0m \" + .message" + "source": ".message, err = \"\u001b[2m\" + \"runner_id=\" + .runner_id + \"\u001b[0m \" + .message" } }, "sinks": { - "actor_logs": { + "runner_logs": { "type": "console", "inputs": [ "add_prefix" diff --git a/packages/edge/services/pegboard/src/keys/datacenter.rs b/packages/edge/services/pegboard/src/keys/datacenter.rs index d46f0910b4..789d4a8f63 100644 --- a/packages/edge/services/pegboard/src/keys/datacenter.rs +++ b/packages/edge/services/pegboard/src/keys/datacenter.rs @@ -3,6 +3,7 @@ use std::result::Result::Ok; use anyhow::*; use chirp_workflow::prelude::*; use fdb_util::prelude::*; +use serde::{Deserialize, Serialize}; use crate::protocol; @@ -146,3 +147,121 @@ impl TuplePack for ClientsByRemainingMemSubspaceKey { Ok(offset) } } + +#[derive(Debug)] +pub struct RunnersByRemainingSlotsKey { + pub build_id: Uuid, + pub remaining_slots: u64, + pub runner_id: Uuid, +} + +impl RunnersByRemainingSlotsKey { + pub fn new(build_id: Uuid, remaining_slots: u64, runner_id: Uuid) -> Self { + RunnersByRemainingSlotsKey { + build_id, + remaining_slots, + runner_id, + } + } + + pub fn subspace(build_id: Uuid) -> RunnersByRemainingSlotsSubspaceKey { + RunnersByRemainingSlotsSubspaceKey::new(build_id) + } + + pub fn subspace_with_slots( + build_id: Uuid, + remaining_slots: u64, + ) -> RunnersByRemainingSlotsSubspaceKey { + RunnersByRemainingSlotsSubspaceKey::new_with_slots(build_id, remaining_slots) + } +} + +impl FormalKey for RunnersByRemainingSlotsKey { + type Value = RunnersByRemainingSlotsKeyData; + + fn deserialize(&self, raw: &[u8]) -> Result { + serde_json::from_slice(raw).map_err(Into::into) + } + + fn serialize(&self, value: Self::Value) -> Result> { + serde_json::to_vec(&value).map_err(Into::into) + } +} + +impl TuplePack for RunnersByRemainingSlotsKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let t = ( + DATACENTER, + RUNNERS_BY_REMAINING_SLOTS, + self.build_id, + self.remaining_slots, + self.runner_id, + ); + t.pack(w, tuple_depth) + } +} + +impl<'de> TupleUnpack<'de> for RunnersByRemainingSlotsKey { + fn unpack(input: &[u8], tuple_depth: TupleDepth) -> PackResult<(&[u8], Self)> { + let (input, (_, _, build_id, remaining_slots, runner_id)) = + <(usize, usize, Uuid, u64, Uuid)>::unpack(input, tuple_depth)?; + + let v = RunnersByRemainingSlotsKey { + build_id, + remaining_slots, + runner_id, + }; + + Ok((input, v)) + } +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct RunnersByRemainingSlotsKeyData { + pub client_id: Uuid, + pub client_workflow_id: Uuid, +} + +pub struct RunnersByRemainingSlotsSubspaceKey { + pub build_id: Uuid, + pub remaining_slots: Option, +} + +impl RunnersByRemainingSlotsSubspaceKey { + pub fn 
new(build_id: Uuid) -> Self { + RunnersByRemainingSlotsSubspaceKey { + build_id, + remaining_slots: None, + } + } + + pub fn new_with_slots(build_id: Uuid, remaining_slots: u64) -> Self { + RunnersByRemainingSlotsSubspaceKey { + build_id, + remaining_slots: Some(remaining_slots), + } + } +} + +impl TuplePack for RunnersByRemainingSlotsSubspaceKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let mut offset = VersionstampOffset::None { size: 0 }; + + let t = (DATACENTER, RUNNERS_BY_REMAINING_SLOTS, self.build_id); + offset += t.pack(w, tuple_depth)?; + + if let Some(remaining_slots) = &self.remaining_slots { + offset += remaining_slots.pack(w, tuple_depth)?; + } + + Ok(offset) + } +} diff --git a/packages/edge/services/pegboard/src/keys/mod.rs b/packages/edge/services/pegboard/src/keys/mod.rs index 392523ef3c..0c0d3a6011 100644 --- a/packages/edge/services/pegboard/src/keys/mod.rs +++ b/packages/edge/services/pegboard/src/keys/mod.rs @@ -5,6 +5,7 @@ pub mod client; pub mod datacenter; pub mod env; pub mod port; +pub mod runner; pub fn subspace() -> fdb_util::Subspace { fdb_util::Subspace::new(&(RIVET, PEGBOARD)) diff --git a/packages/edge/services/pegboard/src/keys/runner.rs b/packages/edge/services/pegboard/src/keys/runner.rs new file mode 100644 index 0000000000..408b0cbede --- /dev/null +++ b/packages/edge/services/pegboard/src/keys/runner.rs @@ -0,0 +1,140 @@ +use std::result::Result::Ok; + +use anyhow::*; +use chirp_workflow::prelude::*; +use fdb_util::prelude::*; + +#[derive(Debug)] +pub struct ImageIdKey { + runner_id: Uuid, +} + +impl ImageIdKey { + pub fn new(runner_id: Uuid) -> Self { + ImageIdKey { runner_id } + } +} + +impl FormalKey for ImageIdKey { + /// Image ID. + type Value = Uuid; + + fn deserialize(&self, raw: &[u8]) -> Result { + Ok(Uuid::from_slice(raw)?) + } + + fn serialize(&self, value: Self::Value) -> Result> { + Ok(value.as_bytes().to_vec()) + } +} + +impl TuplePack for ImageIdKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let t = (RUNNER, DATA, self.runner_id, IMAGE_ID); + t.pack(w, tuple_depth) + } +} + +impl<'de> TupleUnpack<'de> for ImageIdKey { + fn unpack(input: &[u8], tuple_depth: TupleDepth) -> PackResult<(&[u8], Self)> { + let (input, (_, _, runner_id, _)) = + <(usize, usize, Uuid, usize)>::unpack(input, tuple_depth)?; + let v = ImageIdKey { runner_id }; + + Ok((input, v)) + } +} + +#[derive(Debug)] +pub struct RemainingSlotsKey { + runner_id: Uuid, +} + +impl RemainingSlotsKey { + pub fn new(runner_id: Uuid) -> Self { + RemainingSlotsKey { runner_id } + } +} + +impl FormalKey for RemainingSlotsKey { + /// Slots. 
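These allocation keys lean on key order: remaining_slots is a tuple component of RunnersByRemainingSlotsKey, and FoundationDB's tuple encoding, like the big-endian to_be_bytes used for the slot-counter values below, preserves numeric order byte-wise, so a range scan within a build's subspace returns runners sorted by free capacity. A quick check of the big-endian property (illustrative):

fn key_bytes(remaining_slots: u64) -> Vec<u8> {
    remaining_slots.to_be_bytes().to_vec()
}

fn main() {
    // Lexicographic (byte-wise) order matches numeric order, which is
    // what makes "runners by remaining slots" scannable in capacity
    // order. Little-endian bytes would not have this property.
    assert!(key_bytes(3) < key_bytes(200));
    assert!(key_bytes(200) < key_bytes(70_000));
}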
+ type Value = u64; + + fn deserialize(&self, raw: &[u8]) -> Result { + Ok(u64::from_be_bytes(raw.try_into()?)) + } + + fn serialize(&self, value: Self::Value) -> Result> { + Ok(value.to_be_bytes().to_vec()) + } +} + +impl TuplePack for RemainingSlotsKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let t = (RUNNER, DATA, self.runner_id, REMAINING_SLOTS); + t.pack(w, tuple_depth) + } +} + +impl<'de> TupleUnpack<'de> for RemainingSlotsKey { + fn unpack(input: &[u8], tuple_depth: TupleDepth) -> PackResult<(&[u8], Self)> { + let (input, (_, _, runner_id, _)) = + <(usize, usize, Uuid, usize)>::unpack(input, tuple_depth)?; + let v = RemainingSlotsKey { runner_id }; + + Ok((input, v)) + } +} + +#[derive(Debug)] +pub struct TotalSlotsKey { + runner_id: Uuid, +} + +impl TotalSlotsKey { + pub fn new(runner_id: Uuid) -> Self { + TotalSlotsKey { runner_id } + } +} + +impl FormalKey for TotalSlotsKey { + /// MiB. + type Value = u64; + + fn deserialize(&self, raw: &[u8]) -> Result { + Ok(u64::from_be_bytes(raw.try_into()?)) + } + + fn serialize(&self, value: Self::Value) -> Result> { + Ok(value.to_be_bytes().to_vec()) + } +} + +impl TuplePack for TotalSlotsKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let t = (RUNNER, DATA, self.runner_id, TOTAL_SLOTS); + t.pack(w, tuple_depth) + } +} + +impl<'de> TupleUnpack<'de> for TotalSlotsKey { + fn unpack(input: &[u8], tuple_depth: TupleDepth) -> PackResult<(&[u8], Self)> { + let (input, (_, _, runner_id, _)) = + <(usize, usize, Uuid, usize)>::unpack(input, tuple_depth)?; + let v = TotalSlotsKey { runner_id }; + + Ok((input, v)) + } +} diff --git a/packages/edge/services/pegboard/src/lib.rs b/packages/edge/services/pegboard/src/lib.rs index a94c96c30e..6464b2400f 100644 --- a/packages/edge/services/pegboard/src/lib.rs +++ b/packages/edge/services/pegboard/src/lib.rs @@ -21,6 +21,7 @@ pub fn registry() -> WorkflowResult { let mut registry = Registry::new(); registry.register_workflow::()?; registry.register_workflow::()?; + registry.register_workflow::()?; Ok(registry) } diff --git a/packages/edge/services/pegboard/src/protocol.rs b/packages/edge/services/pegboard/src/protocol.rs index fb13aa1530..61ee4927c6 100644 --- a/packages/edge/services/pegboard/src/protocol.rs +++ b/packages/edge/services/pegboard/src/protocol.rs @@ -97,17 +97,30 @@ pub enum Command { /// Whether or not to delete related data (KV store). 
persist_storage: bool, }, + SignalRunner { + runner_id: Uuid, + // See nix::sys::signal::Signal + signal: i32, + }, } #[derive(Debug, Serialize, Deserialize, Clone, Hash)] pub struct ActorConfig { + // TODO: Once old actors are all gone, make this not optional + pub runner: Option, + pub env: HashableMap, + pub metadata: Raw, + + #[deprecated] pub image: Image, + #[deprecated] pub root_user_enabled: bool, + #[deprecated] pub resources: Resources, - pub env: HashableMap, + #[deprecated] pub ports: HashableMap, + #[deprecated] pub network_mode: NetworkMode, - pub metadata: Raw, } #[derive(Debug, Serialize, Deserialize, Clone, Hash)] @@ -122,6 +135,7 @@ pub struct Image { pub artifact_size_bytes: u64, pub kind: ImageKind, pub compression: ImageCompression, + pub allocation_type: ImageAllocationType, } #[derive(Serialize, Deserialize, Hash, Debug, Clone, Copy, PartialEq, Eq)] @@ -142,13 +156,11 @@ impl From for ImageKind { } } -impl ImageKind { - pub fn client_flavor(&self) -> ClientFlavor { - match self { - ImageKind::DockerImage | ImageKind::OciBundle => ClientFlavor::Container, - ImageKind::JavaScript => ClientFlavor::Isolate, - } - } +#[derive(Serialize, Deserialize, Hash, Debug, Clone, Copy, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ImageAllocationType { + Single, + Multi, } #[derive(Serialize, Deserialize, Hash, Debug, Clone, Copy, PartialEq, Eq)] @@ -272,6 +284,36 @@ pub struct ActorMetadataBuild { pub build_id: Uuid, } +#[derive(Debug, Serialize, Deserialize, Clone, Hash)] +pub struct RunnerConfig { + pub image: Image, + pub root_user_enabled: bool, + pub resources: Resources, + pub env: HashableMap, + pub ports: HashableMap, + pub network_mode: NetworkMode, +} + +#[derive(Debug, Serialize, Deserialize, Clone, Hash)] +#[serde(rename_all = "snake_case")] +pub enum ActorRunner { + New { + runner_id: Uuid, + config: RunnerConfig, + }, + Existing { + runner_id: Uuid, + }, +} + +impl ActorRunner { + pub fn runner_id(&self) -> Uuid { + match self { + ActorRunner::New { runner_id, .. 
} | ActorRunner::Existing { runner_id } => *runner_id, + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize, Hash)] pub struct EventWrapper { pub index: i64, @@ -328,6 +370,7 @@ pub struct ProxiedPort { pub enum ClientFlavor { Container = 0, Isolate = 1, + Multi = 2, } impl std::fmt::Display for ClientFlavor { @@ -335,6 +378,7 @@ impl std::fmt::Display for ClientFlavor { match self { ClientFlavor::Container => write!(f, "container"), ClientFlavor::Isolate => write!(f, "isolate"), + ClientFlavor::Multi => write!(f, "multi"), } } } @@ -346,6 +390,7 @@ impl std::str::FromStr for ClientFlavor { match s { "container" => Ok(ClientFlavor::Container), "isolate" => Ok(ClientFlavor::Isolate), + "multi" => Ok(ClientFlavor::Multi), x => Err(PegboardProtocolError::InvalidClientFlavor(x.to_string())), } } diff --git a/packages/edge/services/pegboard/src/workflows/actor/runtime.rs b/packages/edge/services/pegboard/src/workflows/actor/runtime.rs index 7018e90673..f43e85b53d 100644 --- a/packages/edge/services/pegboard/src/workflows/actor/runtime.rs +++ b/packages/edge/services/pegboard/src/workflows/actor/runtime.rs @@ -689,9 +689,12 @@ pub async fn spawn_actor( artifact_size_bytes: actor_setup.artifact_size_bytes, kind: actor_setup.meta.build_kind.into(), compression: actor_setup.meta.build_compression.into(), + // Always single, this is the old actor wf + allocation_type: protocol::ImageAllocationType::Single, }, root_user_enabled: input.root_user_enabled, - env: input.environment.clone(), + env: input.environment.as_hashable(), + runner: None, ports: ports_res .ports .iter() diff --git a/packages/edge/services/pegboard/src/workflows/actor2/destroy.rs b/packages/edge/services/pegboard/src/workflows/actor2/destroy.rs new file mode 100644 index 0000000000..9eaa5d3efe --- /dev/null +++ b/packages/edge/services/pegboard/src/workflows/actor2/destroy.rs @@ -0,0 +1,467 @@ +use build::types::BuildAllocationType; +use chirp_workflow::prelude::*; +use fdb_util::{end_of_key_range, FormalKey, SERIALIZABLE}; +use foundationdb::{self as fdb, options::ConflictRangeType}; +use nix::sys::signal::Signal; + +use super::{DestroyComplete, DestroyStarted}; +use crate::{keys, protocol, types::GameGuardProtocol}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct KillCtx { + pub generation: u32, + pub kill_timeout_ms: i64, +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct Input { + pub actor_id: Uuid, + pub image_id: Uuid, + pub build_allocation_type: Option, + /// Whether or not to send signals to the pb actor. In the case that the actor was already stopped + /// or exited, signals are unnecessary. 
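+	/// When `Some`, the destroy workflow sends SIGTERM (if
+	/// `kill_timeout_ms` is non-zero), sleeps for the timeout, then always
+	/// sends SIGKILL; see `kill` at the bottom of this module.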
+ pub kill: Option, +} + +#[workflow] +pub(crate) async fn pegboard_actor_destroy( + ctx: &mut WorkflowCtx, + input: &Input, +) -> GlobalResult<()> { + ctx.msg(DestroyStarted {}) + .tag("actor_id", input.actor_id) + .send() + .await?; + + let actor = ctx.activity(UpdateDbInput {}).await?; + + if let Some(actor) = actor { + let client_workflow_id = actor.client_workflow_id; + + ctx.activity(UpdateFdbInput { + actor_id: input.actor_id, + image_id: input.image_id, + build_allocation_type: input.build_allocation_type, + actor, + }) + .await?; + + if let (Some(client_workflow_id), Some(kill_data)) = (client_workflow_id, &input.kill) { + kill( + ctx, + input.actor_id, + kill_data.generation, + client_workflow_id, + kill_data.kill_timeout_ms, + false, + ) + .await?; + } + } + + ctx.msg(DestroyComplete {}) + .tag("actor_id", input.actor_id) + .send() + .await?; + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct UpdateDbInput {} + +#[derive(Debug, Serialize, Deserialize, Hash, sqlx::FromRow)] +struct UpdateDbOutput { + env_id: Uuid, + selected_resources_memory_mib: Option, + selected_resources_cpu_millicores: Option, + tags: sqlx::types::Json>, + create_ts: i64, + runner_id: Option, + client_id: Option, + client_workflow_id: Option, +} + +#[activity(UpdateDb)] +async fn update_db( + ctx: &ActivityCtx, + input: &UpdateDbInput, +) -> GlobalResult> { + let pool = ctx.sqlite().await?; + + sql_fetch_optional!( + [ctx, UpdateDbOutput, pool] + " + UPDATE state + SET destroy_ts = ? + WHERE destroy_ts IS NULL + RETURNING + env_id, + selected_resources_memory_mib, + selected_resources_cpu_millicores, + json(tags) AS tags, + create_ts, + runner_id, + client_id, + client_workflow_id + ", + ctx.ts(), + ) + .await +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct UpdateFdbInput { + actor_id: Uuid, + image_id: Uuid, + build_allocation_type: Option, + actor: UpdateDbOutput, +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct UpdateFdbOutput { + destroy_runner: bool, +} + +#[activity(UpdateFdb)] +pub async fn update_fdb( + ctx: &ActivityCtx, + input: &UpdateFdbInput, +) -> GlobalResult { + let pool = ctx.sqlite().await?; + + let ingress_ports = sql_fetch_all!( + [ctx, (i64, i64), pool] + " + SELECT protocol, ingress_port_number + FROM ports_ingress + ", + ) + .await?; + + let destroy_runner = ctx + .fdb() + .await? + .run(|tx, _mc| { + let ingress_ports = ingress_ports.clone(); + async move { + // Update actor key index in env subspace + let actor_key = keys::env::ActorKey::new( + input.actor.env_id, + input.actor.create_ts, + input.actor_id, + ); + let data = keys::env::ActorKeyData { + is_destroyed: true, + tags: input.actor.tags.0.clone().into_iter().collect(), + }; + tx.set( + &keys::subspace().pack(&actor_key), + &actor_key + .serialize(data) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + clear_ports_and_resources( + input.actor_id, + input.image_id, + input.build_allocation_type, + ingress_ports, + input.actor.runner_id, + input.actor.client_id, + input.actor.client_workflow_id, + input.actor.selected_resources_memory_mib, + input.actor.selected_resources_cpu_millicores, + &tx, + ) + .await + } + }) + .custom_instrument(tracing::info_span!("actor_destroy_tx")) + .await?; + + Ok(UpdateFdbOutput { destroy_runner }) +} + +// TODO: Clean up args +/// Clears allocated ports and resources (if they were allocated). 
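+///
+/// Runs inside the caller's FDB transaction. Rough flow:
+/// 1. Clear the actor's ingress-port and proxied-port keys.
+/// 2. If the actor had an allocation, increment the runner's
+///    remaining-slot count and move its entry in the
+///    `RunnersByRemainingSlotsKey` index.
+/// 3. If the runner is now empty, return its memory/CPU to the client
+///    and (for `BuildAllocationType::Multi`) return `true` so the
+///    caller knows to destroy the now-idle runner.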
+pub(crate) async fn clear_ports_and_resources( + actor_id: Uuid, + image_id: Uuid, + build_allocation_type: Option, + ingress_ports: Vec<(i64, i64)>, + runner_id: Option, + client_id: Option, + client_workflow_id: Option, + selected_resources_memory_mib: Option, + selected_resources_cpu_millicores: Option, + tx: &fdb::RetryableTransaction, +) -> Result { + // Remove all allocated ingress ports + for (protocol, port) in ingress_ports { + let ingress_port_key = keys::port::IngressKey::new( + GameGuardProtocol::from_repr( + usize::try_from(protocol) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ) + .ok_or_else(|| { + fdb::FdbBindingError::CustomError( + format!("invalid protocol variant: {protocol}").into(), + ) + })?, + u16::try_from(port).map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + actor_id, + ); + + tx.clear(&keys::subspace().pack(&ingress_port_key)); + } + + // Remove proxied ports + let proxied_ports_key = keys::actor::ProxiedPortsKey::new(actor_id); + tx.clear(&keys::subspace().pack(&proxied_ports_key)); + + if let Some(client_id) = client_id { + // This is cleared when the state changes as well as when the actor is destroyed to ensure + // consistency during rescheduling and forced deletion. + let actor_key = keys::client::ActorKey::new(client_id, actor_id); + tx.clear(&keys::subspace().pack(&actor_key)); + } + + // Release client's resources and update allocation index + if let ( + Some(build_allocation_type), + Some(runner_id), + Some(client_id), + Some(client_workflow_id), + Some(selected_resources_memory_mib), + Some(selected_resources_cpu_millicores), + ) = ( + build_allocation_type, + runner_id, + client_id, + client_workflow_id, + selected_resources_memory_mib, + selected_resources_cpu_millicores, + ) { + let client_flavor = protocol::ClientFlavor::Multi; + + let runner_remaining_slots_key = keys::runner::RemainingSlotsKey::new(runner_id); + let runner_remaining_slots_key_buf = keys::subspace().pack(&runner_remaining_slots_key); + let runner_total_slots_key = keys::runner::TotalSlotsKey::new(runner_id); + let runner_total_slots_key_buf = keys::subspace().pack(&runner_total_slots_key); + let client_remaining_mem_key = keys::client::RemainingMemoryKey::new(client_id); + let client_remaining_mem_key_buf = keys::subspace().pack(&client_remaining_mem_key); + let client_remaining_cpu_key = keys::client::RemainingCpuKey::new(client_id); + let client_remaining_cpu_key_buf = keys::subspace().pack(&client_remaining_cpu_key); + let client_last_ping_ts_key = keys::client::LastPingTsKey::new(client_id); + let client_last_ping_ts_key_buf = keys::subspace().pack(&client_last_ping_ts_key); + + let ( + runner_remaining_slots_entry, + runner_total_slots_entry, + client_remaining_mem_entry, + client_remaining_cpu_entry, + client_last_ping_ts_entry, + ) = tokio::try_join!( + tx.get(&runner_remaining_slots_key_buf, SERIALIZABLE), + tx.get(&runner_total_slots_key_buf, SERIALIZABLE), + tx.get(&client_remaining_mem_key_buf, SERIALIZABLE), + tx.get(&client_remaining_cpu_key_buf, SERIALIZABLE), + tx.get(&client_last_ping_ts_key_buf, SERIALIZABLE), + )?; + + let runner_remaining_slots = runner_remaining_slots_key + .deserialize( + &runner_remaining_slots_entry.ok_or(fdb::FdbBindingError::CustomError( + format!("key should exist: {runner_remaining_slots_key:?}").into(), + ))?, + ) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + let runner_total_slots = runner_total_slots_key + .deserialize( + 
&runner_total_slots_entry.ok_or(fdb::FdbBindingError::CustomError( + format!("key should exist: {runner_total_slots_key:?}").into(), + ))?, + ) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + let client_remaining_mem = client_remaining_mem_key + .deserialize( + &client_remaining_mem_entry.ok_or(fdb::FdbBindingError::CustomError( + format!("key should exist: {client_remaining_mem_key:?}").into(), + ))?, + ) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + let client_remaining_cpu = client_remaining_cpu_key + .deserialize( + &client_remaining_cpu_entry.ok_or(fdb::FdbBindingError::CustomError( + format!("key should exist: {client_remaining_cpu_key:?}").into(), + ))?, + ) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + let client_last_ping_ts = client_last_ping_ts_key + .deserialize( + &client_last_ping_ts_entry.ok_or(fdb::FdbBindingError::CustomError( + format!("key should exist: {client_last_ping_ts_key:?}").into(), + ))?, + ) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + let old_runner_allocation_key = keys::datacenter::RunnersByRemainingSlotsKey::new( + image_id, + runner_remaining_slots, + runner_id, + ); + let old_runner_allocation_key_buf = keys::subspace().pack(&old_runner_allocation_key); + + let old_client_allocation_key = keys::datacenter::ClientsByRemainingMemKey::new( + client_flavor, + client_remaining_mem, + client_last_ping_ts, + client_id, + ); + let old_client_allocation_key_buf = keys::subspace().pack(&old_client_allocation_key); + + let new_runner_remaining_slots = runner_remaining_slots + 1; + + // Write new remaining slots + tx.set( + &runner_remaining_slots_key_buf, + &runner_remaining_slots_key + .serialize(new_runner_remaining_slots) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Clear old key + tx.clear(&old_runner_allocation_key_buf); + + // Add read conflict + tx.add_conflict_range( + &old_runner_allocation_key_buf, + &end_of_key_range(&old_runner_allocation_key_buf), + ConflictRangeType::Read, + )?; + + let destroy_runner = if new_runner_remaining_slots < runner_total_slots { + let new_runner_allocation_key = keys::datacenter::RunnersByRemainingSlotsKey::new( + image_id, + new_runner_remaining_slots, + runner_id, + ); + let new_runner_allocation_key_buf = keys::subspace().pack(&new_runner_allocation_key); + + tx.set( + &new_runner_allocation_key_buf, + &new_runner_allocation_key + .serialize(keys::datacenter::RunnersByRemainingSlotsKeyData { + client_id, + client_workflow_id, + }) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + false + } + // Runner is now empty, release client resources + else { + let new_client_remaining_mem = client_remaining_mem + + u64::try_from(selected_resources_memory_mib) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + let new_client_remaining_cpu = client_remaining_cpu + + u64::try_from(selected_resources_cpu_millicores) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + tracing::debug!( + old_mem=%client_remaining_mem, + old_cpu=%client_remaining_cpu, + new_mem=%new_client_remaining_mem, + new_cpu=%new_client_remaining_cpu, + "releasing resources" + ); + + // Write new memory + tx.set( + &client_remaining_mem_key_buf, + &client_remaining_mem_key + .serialize(new_client_remaining_mem) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + // Write new cpu + tx.set( + &client_remaining_cpu_key_buf, + &client_remaining_cpu_key + .serialize(new_client_remaining_cpu) + .map_err(|x| 
fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Only update allocation idx if it existed before + if tx + .get(&old_client_allocation_key_buf, SERIALIZABLE) + .await? + .is_some() + { + // Clear old key + tx.clear(&old_client_allocation_key_buf); + + let new_client_allocation_key = keys::datacenter::ClientsByRemainingMemKey::new( + client_flavor, + new_client_remaining_mem, + client_last_ping_ts, + client_id, + ); + let new_client_allocation_key_buf = + keys::subspace().pack(&new_client_allocation_key); + + tx.set( + &new_client_allocation_key_buf, + &new_client_allocation_key + .serialize(client_workflow_id) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + } + + // Single container per runner allocations don't require explicitly destroying the runner because + // it is already stopped; the sole container it was running stopped. + matches!(build_allocation_type, BuildAllocationType::Multi) + }; + + Ok(destroy_runner) + } else { + Ok(false) + } +} + +pub(crate) async fn kill( + ctx: &mut WorkflowCtx, + actor_id: Uuid, + generation: u32, + client_workflow_id: Uuid, + kill_timeout_ms: i64, + persist_storage: bool, +) -> GlobalResult<()> { + if kill_timeout_ms != 0 { + ctx.signal(protocol::Command::SignalActor { + actor_id, + generation, + signal: Signal::SIGTERM as i32, + persist_storage, + }) + .to_workflow_id(client_workflow_id) + .send() + .await?; + + // See `docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md` + ctx.sleep(kill_timeout_ms).await?; + } + + ctx.signal(protocol::Command::SignalActor { + actor_id, + generation, + signal: Signal::SIGKILL as i32, + persist_storage, + }) + .to_workflow_id(client_workflow_id) + .send() + .await?; + + Ok(()) +} diff --git a/packages/edge/services/pegboard/src/workflows/actor2/migrations.rs b/packages/edge/services/pegboard/src/workflows/actor2/migrations.rs new file mode 100644 index 0000000000..952da054c7 --- /dev/null +++ b/packages/edge/services/pegboard/src/workflows/actor2/migrations.rs @@ -0,0 +1,78 @@ +use chirp_workflow::prelude::*; +use sqlx::Acquire; + +pub async fn run(ctx: &mut WorkflowCtx) -> GlobalResult<()> { + ctx.activity(MigrateInitInput {}).await?; + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct MigrateInitInput {} + +#[activity(MigrateInit)] +async fn migrate_init(ctx: &ActivityCtx, _input: &MigrateInitInput) -> GlobalResult<()> { + let pool = ctx.sqlite().await?; + let mut conn = pool.conn().await?; + let mut tx = conn.begin().await?; + + sql_execute!( + [ctx, @tx &mut tx] + " + CREATE TABLE state ( + env_id BLOB NOT NULL, -- UUID + tags BLOB NOT NULL, -- JSONB, map + + resources_cpu_millicores INT NOT NULL, + resources_memory_mib INT NOT NULL, + + -- Chosen based on tier + selected_resources_cpu_millicores INT, + selected_resources_memory_mib INT, + + runner_id BLOB, -- UUID + client_id BLOB, -- UUID + client_workflow_id BLOB, -- UUID + client_wan_hostname TEXT, + + lifecycle_kill_timeout_ms INT NOT NULL, + lifecycle_durable INT NOT NULL DEFAULT false, -- BOOLEAN + + create_ts INT NOT NULL, + start_ts INT, + connectable_ts INT, + finish_ts INT, + destroy_ts INT, + + image_id BLOB NOT NULL, -- UUID + args BLOB NOT NULL, -- JSONB, list + network_mode INT NOT NULL, -- pegboard::types::NetworkMode + environment BLOB NOT NULL -- JSONB, map + ) STRICT; + + CREATE TABLE ports_ingress ( + port_name TEXT PRIMARY KEY, + port_number INT, + ingress_port_number INT NOT NULL, + protocol INT NOT NULL -- pegboard::types::GameGuardProtocol + ) STRICT; + + CREATE TABLE ports_host ( + 
port_name TEXT PRIMARY KEY, + port_number INT, + protocol INT NOT NULL -- pegboard::types::HostProtocol + ) STRICT; + + CREATE TABLE ports_proxied ( + port_name TEXT PRIMARY KEY, + ip TEXT NOT NULL, + source INT NOT NULL + ) STRICT; + ", + ) + .await?; + + tx.commit().await?; + + Ok(()) +} diff --git a/packages/edge/services/pegboard/src/workflows/actor2/mod.rs b/packages/edge/services/pegboard/src/workflows/actor2/mod.rs new file mode 100644 index 0000000000..4b8419d48d --- /dev/null +++ b/packages/edge/services/pegboard/src/workflows/actor2/mod.rs @@ -0,0 +1,513 @@ +use std::collections::HashMap; + +use chirp_workflow::prelude::*; +use destroy::KillCtx; +use futures_util::FutureExt; +use util::serde::AsHashableExt; + +use crate::{ + protocol, + types::{ActorLifecycle, ActorResources, EndpointType, NetworkMode, Routing}, +}; + +pub mod destroy; +mod migrations; +mod runtime; +mod setup; + +// A small amount of time to separate the completion of the drain to the deletion of the cluster server. We +// want the drain to complete first. +const DRAIN_PADDING_MS: i64 = 10000; +/// Time to delay an actor from rescheduling after a rescheduling failure. +const BASE_RETRY_TIMEOUT_MS: usize = 2000; +/// How long to wait after creating and not receiving a starting state before setting actor as lost. +const ACTOR_START_THRESHOLD_MS: i64 = util::duration::seconds(30); +/// How long to wait after stopping and not receiving a stop state before setting actor as lost. +const ACTOR_STOP_THRESHOLD_MS: i64 = util::duration::seconds(30); +/// How long to wait after stopped and not receiving an exit state before setting actor as lost. +const ACTOR_EXIT_THRESHOLD_MS: i64 = util::duration::seconds(5); + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct Input { + pub actor_id: Uuid, + pub env_id: Uuid, + pub tags: HashMap, + pub resources: ActorResources, + pub lifecycle: ActorLifecycle, + pub image_id: Uuid, + pub root_user_enabled: bool, + pub args: Vec, + pub network_mode: NetworkMode, + pub environment: HashMap, + pub network_ports: HashMap, + pub endpoint_type: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Hash)] +pub struct Port { + // Null when using host networking since one is automatically assigned + pub internal_port: Option, + pub routing: Routing, +} + +#[workflow] +pub async fn pegboard_actor2(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResult<()> { + migrations::run(ctx).await?; + + let validation_res = ctx + .activity(setup::ValidateInput { + env_id: input.env_id, + tags: input.tags.as_hashable(), + resources: input.resources.clone(), + image_id: input.image_id, + root_user_enabled: input.root_user_enabled, + args: input.args.clone(), + network_mode: input.network_mode, + environment: input.environment.as_hashable(), + network_ports: input.network_ports.as_hashable(), + }) + .await?; + + if let Some(error_message) = validation_res { + ctx.msg(Failed { + message: error_message, + }) + .tag("actor_id", input.actor_id) + .send() + .await?; + + // TODO(RVT-3928): return Ok(Err); + return Ok(()); + } + + let network_ports = ctx + .activity(setup::DisableTlsPortsInput { + network_ports: input.network_ports.as_hashable(), + }) + .await?; + + let res = setup::setup( + ctx, + input, + setup::SetupCtx::Init { + network_ports: network_ports.clone(), + }, + ) + .await; + let initial_actor_setup = match ctx.catch_unrecoverable(res)? 
{ + Ok(res) => res, + Err(err) => { + tracing::error!(?err, "unrecoverable setup"); + + ctx.msg(Failed { + message: "Failed setup.".into(), + }) + .tag("actor_id", input.actor_id) + .send() + .await?; + + ctx.workflow(destroy::Input { + actor_id: input.actor_id, + image_id: input.image_id, + build_allocation_type: None, + kill: None, + }) + .output() + .await?; + + // Throw the original error from the setup activities + return Err(err); + } + }; + + ctx.msg(CreateComplete {}) + .tag("actor_id", input.actor_id) + .send() + .await?; + + let Some(allocate_res) = runtime::spawn_actor(ctx, input, &initial_actor_setup, 0).await? + else { + ctx.msg(Failed { + message: "Failed to allocate (no availability).".into(), + }) + .tag("actor_id", input.actor_id) + .send() + .await?; + + ctx.workflow(destroy::Input { + actor_id: input.actor_id, + image_id: input.image_id, + build_allocation_type: Some(initial_actor_setup.meta.build_allocation_type), + kill: None, + }) + .output() + .await?; + + return Ok(()); + }; + + let lifecycle_res = ctx + .loope( + runtime::State::new( + allocate_res.runner_id, + allocate_res.client_id, + allocate_res.client_workflow_id, + input.image_id, + ), + |ctx, state| { + let input = input.clone(); + + async move { + let sig = if let Some(drain_timeout_ts) = state.drain_timeout_ts { + // Listen for signal with drain timeout + if let Some(sig) = ctx.listen_until::
<Main>(drain_timeout_ts).await? {
+							sig
+						}
+						// Reschedule durable actor on drain end
+						else if input.lifecycle.durable {
+							ctx.activity(runtime::SetConnectableInput { connectable: false })
+								.await?;
+
+							// Kill old actor immediately
+							destroy::kill(
+								ctx,
+								input.actor_id,
+								state.generation,
+								state.client_workflow_id,
+								0,
+								true,
+							)
+							.await?;
+
+							if let Some(sig) =
+								runtime::reschedule_actor(ctx, &input, state, state.image_id)
+									.await?
+							{
+								// Destroyed early
+								return Ok(Loop::Break(runtime::LifecycleRes {
+									image_id: state.image_id,
+									kill: Some(KillCtx {
+										generation: state.generation,
+										kill_timeout_ms: sig
+											.override_kill_timeout_ms
+											.unwrap_or(input.lifecycle.kill_timeout_ms),
+									}),
+								}));
+							} else {
+								state.drain_timeout_ts = None;
+								return Ok(Loop::Continue);
+							}
+						} else {
+							return Ok(Loop::Break(runtime::LifecycleRes {
+								image_id: state.image_id,
+								kill: Some(KillCtx {
+									generation: state.generation,
+									kill_timeout_ms: input.lifecycle.kill_timeout_ms,
+								}),
+							}));
+						}
+					} else if let Some(gc_timeout_ts) = state.gc_timeout_ts {
+						// Listen for signal with gc timeout. If a timeout happens, it means this actor is lost
+						if let Some(sig) = ctx.listen_until::<Main>
(gc_timeout_ts).await? {
+							sig
+						} else {
+							tracing::warn!(actor_id=?input.actor_id, "actor lost");
+
+							// Fake signal
+							Main::StateUpdate(StateUpdate {
+								generation: state.generation,
+								state: protocol::ActorState::Lost,
+							})
+						}
+					} else {
+						// Listen for signal normally
+						ctx.listen::<Main>
().await? + }; + + match sig { + Main::StateUpdate(sig) => { + // Ignore state updates for previous generations + if sig.generation != state.generation { + return Ok(Loop::Continue); + } + + ctx.activity(runtime::UpdateFdbInput { + actor_id: input.actor_id, + client_id: state.client_id, + state: sig.state.clone(), + }) + .await?; + + match sig.state { + protocol::ActorState::Starting => { + state.gc_timeout_ts = None; + + ctx.activity(runtime::SetStartedInput {}).await?; + } + protocol::ActorState::Running { ports, .. } => { + ctx.join(( + activity(runtime::InsertPortsInput { + ports: ports.clone(), + }), + activity(runtime::InsertPortsFdbInput { + actor_id: input.actor_id, + ports, + }), + )) + .await?; + + let updated = ctx + .activity(runtime::SetConnectableInput { + connectable: true, + }) + .await?; + + if updated { + ctx.msg(Ready {}) + .tag("actor_id", input.actor_id) + .send() + .await?; + } + } + protocol::ActorState::Stopping => { + state.gc_timeout_ts = + Some(util::timestamp::now() + ACTOR_STOP_THRESHOLD_MS); + } + protocol::ActorState::Stopped => { + state.gc_timeout_ts = + Some(util::timestamp::now() + ACTOR_EXIT_THRESHOLD_MS); + } + protocol::ActorState::Exited { .. } + | protocol::ActorState::Lost => { + let exit_code = + if let protocol::ActorState::Exited { exit_code } = + sig.state + { + exit_code + } else { + None + }; + + tracing::debug!(?exit_code, "actor stopped"); + + let failed = + exit_code.map(|exit_code| exit_code != 0).unwrap_or(true); + + // Reschedule durable actor if it errored + if input.lifecycle.durable && failed { + ctx.activity(runtime::SetConnectableInput { + connectable: false, + }) + .await?; + + // Kill old actor immediately if lost + if let protocol::ActorState::Lost = sig.state { + destroy::kill( + ctx, + input.actor_id, + state.generation, + state.client_workflow_id, + 0, + true, + ) + .await?; + } + + if runtime::reschedule_actor( + ctx, + &input, + state, + state.image_id, + ) + .await? + .is_some() + { + // Destroyed early + return Ok(Loop::Break(runtime::LifecycleRes { + image_id: state.image_id, + // None here because if we received the destroy signal, it is + // guaranteed that we did not allocate another actor. + kill: None, + })); + } + } else { + ctx.activity(runtime::SetFinishedInput {}).await?; + + if let protocol::ActorState::Lost = sig.state { + ctx.msg(Failed { + message: + "Actor timed out trying to reach a ready state." + .into(), + }) + .tag("actor_id", input.actor_id) + .send() + .await?; + } + + return Ok(Loop::Break(runtime::LifecycleRes { + image_id: state.image_id, + // No need to kill if already exited + kill: matches!(sig.state, protocol::ActorState::Lost) + .then_some(KillCtx { + generation: state.generation, + kill_timeout_ms: 0, + }), + })); + } + } + } + } + Main::Upgrade(sig) => { + ctx.msg(UpgradeStarted {}) + .tag("actor_id", input.actor_id) + .send() + .await?; + + ctx.activity(runtime::SetConnectableInput { connectable: false }) + .await?; + + // Kill old actor immediately + destroy::kill( + ctx, + input.actor_id, + state.generation, + state.client_workflow_id, + 0, + true, + ) + .await?; + + ctx.activity(runtime::UpdateImageInput { + image_id: sig.image_id, + }) + .await?; + state.image_id = sig.image_id; + + if let Some(sig) = + runtime::reschedule_actor(ctx, &input, state, state.image_id) + .await? 
+ { + // Destroyed early + return Ok(Loop::Break(runtime::LifecycleRes { + image_id: input.image_id, + kill: Some(KillCtx { + generation: state.generation, + kill_timeout_ms: sig + .override_kill_timeout_ms + .unwrap_or(input.lifecycle.kill_timeout_ms), + }), + })); + } + + ctx.msg(UpgradeComplete {}) + .tag("actor_id", input.actor_id) + .send() + .await?; + } + Main::Drain(sig) => { + state.drain_timeout_ts = Some( + sig.drain_timeout_ts + - DRAIN_PADDING_MS - input.lifecycle.kill_timeout_ms, + ); + } + Main::Undrain(_) => { + state.drain_timeout_ts = None; + } + Main::Destroy(sig) => { + return Ok(Loop::Break(runtime::LifecycleRes { + image_id: input.image_id, + kill: Some(KillCtx { + generation: state.generation, + kill_timeout_ms: sig + .override_kill_timeout_ms + .unwrap_or(input.lifecycle.kill_timeout_ms), + }), + })) + } + } + + Ok(Loop::Continue) + } + .boxed() + }, + ) + .await?; + + ctx.workflow(destroy::Input { + actor_id: input.actor_id, + image_id: lifecycle_res.image_id, + build_allocation_type: Some(initial_actor_setup.meta.build_allocation_type), + kill: lifecycle_res.kill, + }) + .output() + .await?; + + Ok(()) +} + +#[message("pegboard_actor_create_complete")] +pub struct CreateComplete {} + +#[message("pegboard_actor_failed")] +pub struct Failed { + pub message: String, +} + +#[message("pegboard_actor_ready")] +pub struct Ready {} + +#[signal("pegboard_actor_destroy")] +pub struct Destroy { + pub override_kill_timeout_ms: Option, +} + +#[signal("pegboard_actor_drain")] +pub struct Drain { + pub drain_timeout_ts: i64, +} + +#[signal("pegboard_actor_undrain")] +pub struct Undrain {} + +#[message("pegboard_actor_destroy_started")] +pub struct DestroyStarted {} + +#[message("pegboard_actor_destroy_complete")] +pub struct DestroyComplete {} + +#[signal("pegboard_actor_upgrade")] +pub struct Upgrade { + pub image_id: Uuid, +} + +#[signal("pegboard_actor_state_update")] +pub struct StateUpdate { + #[serde(default)] + pub generation: u32, + pub state: protocol::ActorState, +} + +#[message("pegboard_actor_upgrade_started")] +pub struct UpgradeStarted {} + +#[message("pegboard_actor_upgrade_complete")] +pub struct UpgradeComplete {} + +join_signal!(Main { + StateUpdate, + Upgrade, + Drain, + Undrain, + Destroy, +}); + +// Stub definition +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct WaitForTraefikPollInput {} +#[activity(WaitForTraefikPoll)] +pub async fn wait_for_traefik_poll( + _ctx: &ActivityCtx, + _input: &WaitForTraefikPollInput, +) -> GlobalResult<()> { + Ok(()) +} diff --git a/packages/edge/services/pegboard/src/workflows/actor2/runtime.rs b/packages/edge/services/pegboard/src/workflows/actor2/runtime.rs new file mode 100644 index 0000000000..e97470d3a8 --- /dev/null +++ b/packages/edge/services/pegboard/src/workflows/actor2/runtime.rs @@ -0,0 +1,1067 @@ +use std::time::Instant; + +use build::types::{BuildAllocationType, BuildCompression, BuildKind}; +use chirp_workflow::prelude::*; +use fdb_util::{end_of_key_range, FormalKey, SERIALIZABLE, SNAPSHOT}; +use foundationdb::{ + self as fdb, + options::{ConflictRangeType, StreamingMode}, +}; +use futures_util::{FutureExt, TryStreamExt}; +use sqlx::Acquire; +use util::serde::AsHashableExt; + +use super::{ + destroy::{self, KillCtx}, + setup, Destroy, Input, ACTOR_START_THRESHOLD_MS, BASE_RETRY_TIMEOUT_MS, +}; +use crate::{ + keys, metrics, + ops::actor::get, + protocol, + types::{EndpointType, GameGuardProtocol, HostProtocol, NetworkMode, Port, Routing}, + workflows::client::CLIENT_ELIGIBLE_THRESHOLD_MS, +}; + 
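+/// State carried across iterations of the `pegboard_actor2` lifecycle
+/// loop.
+///
+/// `generation` is bumped on every reschedule so that stale
+/// `StateUpdate` signals from a previous runner generation can be
+/// ignored. `gc_timeout_ts` marks the actor as lost if the client stops
+/// reporting state transitions in time, and `drain_timeout_ts` bounds
+/// how long a drain may run before the actor is killed or rescheduled.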
+#[derive(Deserialize, Serialize)] +pub struct State { + pub generation: u32, + pub runner_id: Uuid, + pub client_id: Uuid, + pub client_workflow_id: Uuid, + pub image_id: Uuid, + pub drain_timeout_ts: Option, + pub gc_timeout_ts: Option, +} + +impl State { + pub fn new(runner_id: Uuid, client_id: Uuid, client_workflow_id: Uuid, image_id: Uuid) -> Self { + State { + generation: 0, + client_id, + client_workflow_id, + runner_id, + image_id, + drain_timeout_ts: None, + gc_timeout_ts: Some(util::timestamp::now() + ACTOR_START_THRESHOLD_MS), + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct LifecycleRes { + pub image_id: Uuid, + pub kill: Option, +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct UpdateClientInput { + client_id: Uuid, + client_workflow_id: Uuid, +} + +#[activity(UpdateClient)] +async fn update_client(ctx: &ActivityCtx, input: &UpdateClientInput) -> GlobalResult<()> { + let client_pool = ctx.sqlite_for_workflow(input.client_workflow_id).await?; + let pool = ctx.sqlite().await?; + + let (client_wan_hostname,) = sql_fetch_one!( + [ctx, (String,), client_pool] + " + SELECT config->'network'->>'wan_hostname' AS wan_hostname + FROM state + ", + ) + .await?; + + sql_execute!( + [ctx, pool] + " + UPDATE state + SET + client_id = ?, + client_workflow_id = ?, + client_wan_hostname = ? + ", + input.client_id, + input.client_workflow_id, + client_wan_hostname, + ) + .await?; + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct FetchPortsInput { + actor_id: Uuid, + endpoint_type: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +struct FetchPortsOutput { + ports: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +struct FetchedPort { + name: String, + port_number: Option, + port: Port, +} + +#[activity(FetchPorts)] +async fn fetch_ports(ctx: &ActivityCtx, input: &FetchPortsInput) -> GlobalResult { + let pool = ctx.sqlite().await?; + + let dc_id = ctx.config().server()?.rivet.edge()?.datacenter_id; + + let ((wan_hostname,), port_ingress_rows, port_host_rows, dc_res) = tokio::try_join!( + sql_fetch_one!( + [ctx, (Option,), &pool] + " + SELECT client_wan_hostname + FROM state + ", + ), + sql_fetch_all!( + [ctx, get::PortIngress, &pool] + " + SELECT + port_name, + port_number, + ingress_port_number, + protocol + FROM ports_ingress + ", + ), + sql_fetch_all!( + [ctx, get::PortHost, &pool] + " + SELECT port_name, port_number, protocol + FROM ports_host + ", + ), + ctx.op(cluster::ops::datacenter::get::Input { + datacenter_ids: vec![dc_id], + }), + )?; + + let dc = unwrap!(dc_res.datacenters.first()); + + let endpoint_type = input.endpoint_type.unwrap_or_else(|| { + EndpointType::default_for_guard_public_hostname(&dc.guard_public_hostname) + }); + + let ports = port_ingress_rows + .into_iter() + .map(|row| { + let port = get::create_port_ingress( + input.actor_id, + &row, + unwrap!(GameGuardProtocol::from_repr(row.protocol.try_into()?)), + endpoint_type, + &dc.guard_public_hostname, + )?; + + Ok(FetchedPort { + name: row.port_name, + port_number: row.port_number.map(TryInto::try_into).transpose()?, + port, + }) + }) + .chain(port_host_rows.into_iter().map(|row| { + let port = get::create_port_host( + true, + wan_hostname.as_deref(), + &row, + // Placeholder, will be replaced by the manager when building metadata + Some(&get::PortProxied { + port_name: String::new(), + source: 0, + }), + )?; + + Ok(FetchedPort { + name: row.port_name, + port_number: row.port_number.map(TryInto::try_into).transpose()?, + port, + }) + })) + .collect::>>()?; + + 
Ok(FetchPortsOutput { ports }) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct AllocateActorInput { + actor_id: Uuid, + generation: u32, + image_id: Uuid, + build_allocation_type: BuildAllocationType, + build_allocation_total_slots: u64, + resources: protocol::Resources, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct AllocateActorOutput { + pub runner_id: Uuid, + pub new_runner: bool, + pub client_id: Uuid, + pub client_workflow_id: Uuid, +} + +#[activity(AllocateActor)] +async fn allocate_actor( + ctx: &ActivityCtx, + input: &AllocateActorInput, +) -> GlobalResult> { + let client_flavor = protocol::ClientFlavor::Multi; + let memory_mib = input.resources.memory / 1024 / 1024; + + let start_instant = Instant::now(); + + let res = ctx + .fdb() + .await? + .run(|tx, _mc| async move { + // Select a range that only includes runners that have enough remaining slots to allocate this actor + let start = keys::subspace().pack( + &keys::datacenter::RunnersByRemainingSlotsKey::subspace_with_slots( + input.image_id, + 1, + ), + ); + let runner_allocation_subspace = + keys::datacenter::RunnersByRemainingSlotsKey::subspace(input.image_id); + let end = keys::subspace() + .subspace(&runner_allocation_subspace) + .range() + .1; + + let mut stream = tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Iterator, + // Containers bin pack so we reverse the order + reverse: true, + ..(start, end).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the keys, just + // the one we choose + SNAPSHOT, + ); + + if let BuildAllocationType::Multi = input.build_allocation_type { + loop { + let Some(entry) = stream.try_next().await? else { + return Ok(None); + }; + + let old_runner_allocation_key = keys::subspace() + .unpack::(entry.key()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + let data = old_runner_allocation_key + .deserialize(entry.value()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Add read conflict only for this key + tx.add_conflict_range( + entry.key(), + &end_of_key_range(entry.key()), + ConflictRangeType::Read, + )?; + + // Clear old entry + tx.clear(entry.key()); + + let new_remaining_slots = + old_runner_allocation_key.remaining_slots.saturating_sub(1); + + // Write new allocation key with 1 less slot + let new_allocation_key = keys::datacenter::RunnersByRemainingSlotsKey::new( + input.image_id, + new_remaining_slots, + old_runner_allocation_key.runner_id, + ); + tx.set(&keys::subspace().pack(&new_allocation_key), entry.value()); + + // Update runner record + let remaining_slots_key = + keys::runner::RemainingSlotsKey::new(old_runner_allocation_key.runner_id); + tx.set( + &keys::subspace().pack(&remaining_slots_key), + &remaining_slots_key + .serialize(new_remaining_slots) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Insert actor index key + let client_actor_key = + keys::client::ActorKey::new(data.client_id, input.actor_id); + tx.set( + &keys::subspace().pack(&client_actor_key), + &client_actor_key + .serialize(input.generation) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + return Ok(Some(AllocateActorOutput { + runner_id: old_runner_allocation_key.runner_id, + new_runner: false, + client_id: data.client_id, + client_workflow_id: data.client_workflow_id, + })); + } + } + + // No available runner found, create a new one + let runner_id = Uuid::new_v4(); + + let ping_threshold_ts = util::timestamp::now() - 
CLIENT_ELIGIBLE_THRESHOLD_MS; + + // Select a range that only includes clients that have enough remaining mem to allocate this actor + let start = keys::subspace().pack( + &keys::datacenter::ClientsByRemainingMemKey::subspace_with_mem( + client_flavor, + memory_mib, + ), + ); + let client_allocation_subspace = + keys::datacenter::ClientsByRemainingMemKey::subspace(client_flavor); + let end = keys::subspace() + .subspace(&client_allocation_subspace) + .range() + .1; + + let mut stream = tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Iterator, + // Containers bin pack so we reverse the order + reverse: true, + ..(start, end).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the keys, just + // the one we choose + SNAPSHOT, + ); + + loop { + let Some(entry) = stream.try_next().await? else { + return Ok(None); + }; + + let old_client_allocation_key = keys::subspace() + .unpack::(entry.key()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Scan by last ping + if old_client_allocation_key.last_ping_ts < ping_threshold_ts { + continue; + } + + let client_workflow_id = old_client_allocation_key + .deserialize(entry.value()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Add read conflict only for this key + tx.add_conflict_range( + entry.key(), + &end_of_key_range(entry.key()), + ConflictRangeType::Read, + )?; + + // Clear old entry + tx.clear(entry.key()); + + // Read old cpu + let remaining_cpu_key = + keys::client::RemainingCpuKey::new(old_client_allocation_key.client_id); + let remaining_cpu_key_buf = keys::subspace().pack(&remaining_cpu_key); + let remaining_cpu_entry = tx.get(&remaining_cpu_key_buf, SERIALIZABLE).await?; + let old_remaining_cpu = remaining_cpu_key + .deserialize( + &remaining_cpu_entry.ok_or(fdb::FdbBindingError::CustomError( + format!("key should exist: {remaining_cpu_key:?}").into(), + ))?, + ) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Update allocated amount + let new_remaining_mem = old_client_allocation_key.remaining_mem - memory_mib; + let new_remaining_cpu = old_remaining_cpu - input.resources.cpu; + let new_allocation_key = keys::datacenter::ClientsByRemainingMemKey::new( + client_flavor, + new_remaining_mem, + old_client_allocation_key.last_ping_ts, + old_client_allocation_key.client_id, + ); + tx.set(&keys::subspace().pack(&new_allocation_key), entry.value()); + + tracing::debug!( + old_mem=%old_client_allocation_key.remaining_mem, + old_cpu=%old_remaining_cpu, + new_mem=%new_remaining_mem, + new_cpu=%new_remaining_cpu, + "allocating runner resources" + ); + + // Update client record + let remaining_mem_key = + keys::client::RemainingMemoryKey::new(old_client_allocation_key.client_id); + tx.set( + &keys::subspace().pack(&remaining_mem_key), + &remaining_mem_key + .serialize(new_remaining_mem) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + tx.set( + &remaining_cpu_key_buf, + &remaining_cpu_key + .serialize(new_remaining_cpu) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + if let BuildAllocationType::Multi = input.build_allocation_type { + let remaining_slots = input.build_allocation_total_slots.saturating_sub(1); + let total_slots = input.build_allocation_total_slots; + + // Insert runner records + let remaining_slots_key = keys::runner::RemainingSlotsKey::new(runner_id); + tx.set( + &keys::subspace().pack(&remaining_slots_key), + &remaining_slots_key + .serialize(remaining_slots) + 
.map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + let total_slots_key = keys::runner::TotalSlotsKey::new(runner_id); + tx.set( + &keys::subspace().pack(&total_slots_key), + &total_slots_key + .serialize(total_slots) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Insert runner index key + let runner_idx_key = keys::datacenter::RunnersByRemainingSlotsKey::new( + input.image_id, + remaining_slots, + runner_id, + ); + tx.set( + &keys::subspace().pack(&runner_idx_key), + &runner_idx_key + .serialize(keys::datacenter::RunnersByRemainingSlotsKeyData { + client_id: old_client_allocation_key.client_id, + client_workflow_id, + }) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + } + + // Insert actor index key + let client_actor_key = keys::client::ActorKey::new( + old_client_allocation_key.client_id, + input.actor_id, + ); + tx.set( + &keys::subspace().pack(&client_actor_key), + &client_actor_key + .serialize(input.generation) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + return Ok(Some(AllocateActorOutput { + runner_id, + new_runner: true, + client_id: old_client_allocation_key.client_id, + client_workflow_id, + })); + } + }) + .custom_instrument(tracing::info_span!("actor_allocate_tx")) + .await?; + + let dt = start_instant.elapsed().as_secs_f64(); + metrics::ACTOR_ALLOCATE_DURATION + .with_label_values(&[&res.is_some().to_string()]) + .observe(dt); + + Ok(res) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct UpdateFdbInput { + pub actor_id: Uuid, + pub client_id: Uuid, + pub state: protocol::ActorState, +} + +#[activity(UpdateFdb)] +pub async fn update_fdb(ctx: &ActivityCtx, input: &UpdateFdbInput) -> GlobalResult<()> { + use protocol::ActorState::*; + + match &input.state { + Starting | Running { .. } | Stopping => {} + Stopped | Lost | Exited { .. } => { + ctx.fdb() + .await? + .run(|tx, _mc| async move { + // Was inserted when the actor was allocated. This is cleared when the state changes as + // well as when the actor is destroyed to ensure consistency during rescheduling and + // forced deletion. + let actor_key = keys::client::ActorKey::new(input.client_id, input.actor_id); + tx.clear(&keys::subspace().pack(&actor_key)); + + Ok(()) + }) + .custom_instrument(tracing::info_span!("actor_clear_tx")) + .await?; + } + } + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct UpdateImageInput { + pub image_id: Uuid, +} + +#[activity(UpdateImage)] +pub async fn update_image(ctx: &ActivityCtx, input: &UpdateImageInput) -> GlobalResult<()> { + let pool = ctx.sqlite().await?; + + sql_execute!( + [ctx, pool] + " + UPDATE state + SET image_id = ? + ", + input.image_id, + ) + .await?; + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct SetStartedInput {} + +#[activity(SetStarted)] +pub async fn set_started(ctx: &ActivityCtx, input: &SetStartedInput) -> GlobalResult<()> { + let pool = ctx.sqlite().await?; + let start_ts = util::timestamp::now(); + + let row = sql_fetch_optional!( + [ctx, (i64,), pool] + " + UPDATE state + SET start_ts = ? 
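+			-- Guard makes this idempotent: only the first start returns a
+			-- row, so the start-duration metric below is observed once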
+ WHERE start_ts IS NULL + RETURNING create_ts + ", + start_ts, + ) + .await?; + + // Add start duration if this is the first start + if let Some((create_ts,)) = row { + let dt = (start_ts - create_ts) as f64 / 1000.0; + metrics::ACTOR_START_DURATION + .with_label_values(&[]) + .observe(dt); + } + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct SetConnectableInput { + pub connectable: bool, +} + +#[activity(SetConnectable)] +pub async fn set_connectable(ctx: &ActivityCtx, input: &SetConnectableInput) -> GlobalResult { + let pool = ctx.sqlite().await?; + + let res = sql_execute!( + [ctx, pool] + " + UPDATE state + SET connectable_ts = ? + WHERE + CASE WHEN ? + THEN connectable_ts IS NULL + ELSE connectable_ts IS NOT NULL + END + ", + input.connectable.then(util::timestamp::now), + input.connectable, + ) + .await?; + + Ok(res.rows_affected() > 0) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct InsertPortsInput { + pub ports: util::serde::HashableMap, +} + +#[activity(InsertPorts)] +pub async fn insert_ports(ctx: &ActivityCtx, input: &InsertPortsInput) -> GlobalResult<()> { + let pool = ctx.sqlite().await?; + let mut conn = pool.conn().await?; + let mut tx = conn.begin().await?; + + for (port_name, port) in &input.ports { + sql_execute!( + [ctx, @tx &mut tx] + " + INSERT INTO ports_proxied ( + port_name, + source, + ip + ) + VALUES (?, ?, ?) + ", + port_name, + port.source as i64, + &port.lan_hostname, + ) + .await?; + } + + tx.commit().await?; + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct InsertPortsFdbInput { + pub actor_id: Uuid, + pub ports: util::serde::HashableMap, +} + +#[activity(InsertPortsFdb)] +pub async fn insert_ports_fdb(ctx: &ActivityCtx, input: &InsertPortsFdbInput) -> GlobalResult<()> { + let pool = &ctx.sqlite().await?; + + let ((create_ts,), ingress_ports) = tokio::try_join!( + sql_fetch_one!( + [ctx, (i64,), pool] + " + SELECT create_ts + FROM state + ", + ), + sql_fetch_all!( + [ctx, (String, i64, i64), pool] + " + SELECT port_name, ingress_port_number, protocol + FROM ports_ingress + ", + ), + )?; + + let proxied_ports = input + .ports + .iter() + // Match to ingress ports for GG + .filter_map(|(port_name, port)| { + if let Some((_, ingress_port_number, protocol)) = ingress_ports + .iter() + .find(|(ingress_port_name, _, _)| port_name == ingress_port_name) + { + Some((port_name, port, ingress_port_number, protocol)) + } else { + None + } + }) + .map(|(port_name, port, ingress_port_number, protocol)| { + let protocol = unwrap!(GameGuardProtocol::from_repr((*protocol).try_into()?)); + + Ok(keys::actor::ProxiedPort { + port_name: port_name.clone(), + create_ts, + lan_hostname: port.lan_hostname.clone(), + source: port.source, + ingress_port_number: (*ingress_port_number).try_into()?, + protocol, + }) + }) + .collect::>>()?; + + // Write proxied ingress ports to fdb index + ctx.fdb() + .await? + .run(|tx, _mc| { + let proxied_ports = proxied_ports.clone(); + async move { + let proxied_ports_key = keys::actor::ProxiedPortsKey::new(input.actor_id); + + tx.set( + &keys::subspace().pack(&proxied_ports_key), + &proxied_ports_key + .serialize(proxied_ports) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + Ok(()) + } + }) + .custom_instrument(tracing::info_span!("actor_insert_proxied_ports_tx")) + .await?; + + Ok(()) +} + +/// Returns whether or not there was availability to spawn the actor. 
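+///
+/// `None` means `AllocateActor` found no capacity: no existing runner
+/// had a free slot (for `Multi` builds) and no eligible client had
+/// enough remaining memory to place a new runner. A sketch of how
+/// callers handle this (mirroring the usage in `mod.rs`):
+/// ```ignore
+/// let Some(res) = spawn_actor(ctx, input, &actor_setup, generation).await? else {
+///     // No availability; fail the actor or retry with backoff.
+///     return Ok(None);
+/// };
+/// ```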
+pub async fn spawn_actor( + ctx: &mut WorkflowCtx, + input: &Input, + actor_setup: &setup::ActorSetupCtx, + generation: u32, +) -> GlobalResult> { + let res = ctx + .activity(AllocateActorInput { + actor_id: input.actor_id, + generation, + image_id: actor_setup.image_id, + build_allocation_type: actor_setup.meta.build_allocation_type, + build_allocation_total_slots: actor_setup.meta.build_allocation_total_slots, + resources: actor_setup.resources.clone(), + }) + .await?; + + let Some(res) = res else { + return Ok(None); + }; + + let (_, ports_res) = ctx + .join(( + activity(UpdateClientInput { + client_id: res.client_id, + client_workflow_id: res.client_workflow_id, + }), + v(2).activity(FetchPortsInput { + actor_id: input.actor_id, + endpoint_type: input.endpoint_type, + }), + )) + .await?; + + let cluster_id = ctx.config().server()?.rivet.edge()?.cluster_id; + + let image = protocol::Image { + id: actor_setup.image_id, + artifact_url_stub: actor_setup.artifact_url_stub.clone(), + fallback_artifact_url: actor_setup.fallback_artifact_url.clone(), + kind: match actor_setup.meta.build_kind { + BuildKind::DockerImage => protocol::ImageKind::DockerImage, + BuildKind::OciBundle => protocol::ImageKind::OciBundle, + BuildKind::JavaScript => bail!("actors do not support js builds"), + }, + compression: match actor_setup.meta.build_compression { + BuildCompression::None => protocol::ImageCompression::None, + BuildCompression::Lz4 => protocol::ImageCompression::Lz4, + }, + allocation_type: match actor_setup.meta.build_allocation_type { + BuildAllocationType::None => bail!("actors do not support old builds"), + BuildAllocationType::Single => protocol::ImageAllocationType::Single, + BuildAllocationType::Multi => protocol::ImageAllocationType::Multi, + }, + }; + let ports = ports_res + .ports + .iter() + .map(|port| match port.port.routing { + Routing::GameGuard { protocol } => ( + crate::util::pegboard_normalize_port_name(&port.name), + protocol::Port { + target: port.port_number, + protocol: match protocol { + GameGuardProtocol::Http + | GameGuardProtocol::Https + | GameGuardProtocol::Tcp + | GameGuardProtocol::TcpTls => protocol::TransportProtocol::Tcp, + GameGuardProtocol::Udp => protocol::TransportProtocol::Udp, + }, + routing: protocol::PortRouting::GameGuard, + }, + ), + Routing::Host { protocol } => ( + crate::util::pegboard_normalize_port_name(&port.name), + protocol::Port { + target: port.port_number, + protocol: match protocol { + HostProtocol::Tcp => protocol::TransportProtocol::Tcp, + HostProtocol::Udp => protocol::TransportProtocol::Udp, + }, + routing: protocol::PortRouting::Host, + }, + ), + }) + .collect::>(); + let network_mode = match input.network_mode { + NetworkMode::Bridge => protocol::NetworkMode::Bridge, + NetworkMode::Host => protocol::NetworkMode::Host, + }; + + ctx.signal(protocol::Command::StartActor { + actor_id: input.actor_id, + generation, + config: Box::new(protocol::ActorConfig { + runner: if res.new_runner { + Some(protocol::ActorRunner::New { + runner_id: res.runner_id, + config: protocol::RunnerConfig { + image: image.clone(), + root_user_enabled: input.root_user_enabled, + resources: actor_setup.resources.clone(), + env: input.environment.as_hashable(), + ports: ports.clone(), + network_mode, + }, + }) + } else { + Some(protocol::ActorRunner::Existing { + runner_id: res.runner_id, + }) + }, + env: input.environment.as_hashable(), + metadata: util::serde::Raw::new(&protocol::ActorMetadata { + actor: protocol::ActorMetadataActor { + actor_id: input.actor_id, + 
tags: input.tags.as_hashable(), + create_ts: ctx.ts(), + }, + network: Some(protocol::ActorMetadataNetwork { + ports: ports_res + .ports + .into_iter() + .map(|port| (port.name, port.port)) + .collect(), + }), + project: protocol::ActorMetadataProject { + project_id: actor_setup.meta.project_id, + slug: actor_setup.meta.project_slug.clone(), + }, + environment: protocol::ActorMetadataEnvironment { + env_id: input.env_id, + slug: actor_setup.meta.env_slug.clone(), + }, + datacenter: protocol::ActorMetadataDatacenter { + name_id: actor_setup.meta.dc_name_id.clone(), + display_name: actor_setup.meta.dc_display_name.clone(), + }, + cluster: protocol::ActorMetadataCluster { cluster_id }, + build: protocol::ActorMetadataBuild { + build_id: actor_setup.image_id, + }, + })?, + + // Deprecated + image, + root_user_enabled: input.root_user_enabled, + resources: actor_setup.resources.clone(), + ports, + network_mode, + }), + }) + .to_workflow_id(res.client_workflow_id) + .send() + .await?; + + Ok(Some(res)) +} + +pub async fn reschedule_actor( + ctx: &mut WorkflowCtx, + input: &Input, + state: &mut State, + image_id: Uuid, +) -> GlobalResult> { + tracing::debug!(actor_id=?input.actor_id, "rescheduling actor"); + + ctx.activity(ClearPortsAndResourcesInput { + actor_id: input.actor_id, + image_id, + runner_id: state.runner_id, + client_id: state.client_id, + client_workflow_id: state.client_workflow_id, + }) + .await?; + + let actor_setup = setup::setup(ctx, &input, setup::SetupCtx::Reschedule { image_id }).await?; + + let next_generation = state.generation + 1; + + // Waits for the actor to be ready (or destroyed) and automatically retries if failed to allocate. + let res = ctx + .loope(RescheduleState::default(), |ctx, state| { + let input = input.clone(); + let actor_setup = actor_setup.clone(); + + async move { + // Determine next backoff sleep duration + let mut backoff = + util::Backoff::new_at(8, None, BASE_RETRY_TIMEOUT_MS, 500, state.retry_count); + + // If the last retry ts is more than 2 * backoff ago, reset retry count to 0 + let now = util::timestamp::now(); + state.retry_count = + if state.last_retry_ts < now - i64::try_from(2 * backoff.current_duration())? { + 0 + } else { + state.retry_count + 1 + }; + state.last_retry_ts = now; + + // Don't sleep for first retry + if state.retry_count > 0 { + let next = backoff.step().expect("should not have max retry"); + + // Sleep for backoff or destroy early + if let Some(sig) = ctx + .listen_with_timeout::(Instant::from(next) - Instant::now()) + .await? + { + tracing::debug!("destroying before actor start"); + + return Ok(Loop::Break(Err(sig))); + } + } + + if let Some(res) = spawn_actor(ctx, &input, &actor_setup, next_generation).await? 
+				{
+					Ok(Loop::Break(Ok(res)))
+				} else {
+					tracing::debug!(actor_id=?input.actor_id, "failed to reschedule actor, retrying");
+
+					Ok(Loop::Continue)
+				}
+			}
+			.boxed()
+		})
+		.await?;
+
+	// Update loop state
+	match res {
+		Ok(res) => {
+			state.generation = next_generation;
+			state.runner_id = res.runner_id;
+			state.client_id = res.client_id;
+			state.client_workflow_id = res.client_workflow_id;
+
+			// Reset gc timeout once allocated
+			state.gc_timeout_ts = Some(util::timestamp::now() + ACTOR_START_THRESHOLD_MS);
+
+			Ok(None)
+		}
+		Err(sig) => Ok(Some(sig)),
+	}
+}
+
+#[derive(Serialize, Deserialize, Default)]
+struct RescheduleState {
+	last_retry_ts: i64,
+	retry_count: usize,
+}
+
+#[derive(Debug, Serialize, Deserialize, Hash)]
+struct ClearPortsAndResourcesInput {
+	actor_id: Uuid,
+	image_id: Uuid,
+	runner_id: Uuid,
+	client_id: Uuid,
+	client_workflow_id: Uuid,
+}
+
+#[activity(ClearPortsAndResources)]
+async fn clear_ports_and_resources(
+	ctx: &ActivityCtx,
+	input: &ClearPortsAndResourcesInput,
+) -> GlobalResult<()> {
+	let pool = &ctx.sqlite().await?;
+
+	let (
+		build_res,
+		ingress_ports,
+		(selected_resources_cpu_millicores, selected_resources_memory_mib),
+		_,
+	) = tokio::try_join!(
+		ctx.op(build::ops::get::Input {
+			build_ids: vec![input.image_id],
+		}),
+		sql_fetch_all!(
+			[ctx, (i64, i64), pool]
+			"
+			SELECT protocol, ingress_port_number
+			FROM ports_ingress
+			",
+		),
+		sql_fetch_one!(
+			[ctx, (Option<i64>, Option<i64>), pool]
+			"
+			SELECT selected_resources_cpu_millicores, selected_resources_memory_mib
+			FROM state
+			",
+		),
+		// Idempotent
+		sql_execute!(
+			[ctx, pool]
+			"
+			DELETE FROM ports_proxied
+			",
+		),
+	)?;
+	let build = unwrap_with!(build_res.builds.first(), BUILD_NOT_FOUND);
+
+	ctx.fdb()
+		.await?
+		.run(|tx, _mc| {
+			let ingress_ports = ingress_ports.clone();
+			async move {
+				destroy::clear_ports_and_resources(
+					input.actor_id,
+					input.image_id,
+					Some(build.allocation_type),
+					ingress_ports,
+					Some(input.runner_id),
+					Some(input.client_id),
+					Some(input.client_workflow_id),
+					selected_resources_memory_mib,
+					selected_resources_cpu_millicores,
+					&tx,
+				)
+				.await
+			}
+		})
+		.custom_instrument(tracing::info_span!("actor_clear_ports_and_resources_tx"))
+		.await?;
+
+	Ok(())
+}
+
+#[derive(Debug, Serialize, Deserialize, Hash)]
+pub struct SetFinishedInput {}
+
+#[activity(SetFinished)]
+pub async fn set_finished(ctx: &ActivityCtx, input: &SetFinishedInput) -> GlobalResult<()> {
+	let pool = ctx.sqlite().await?;
+
+	sql_execute!(
+		[ctx, pool]
+		"
+		UPDATE state
+		SET finish_ts = ?
+ ", + util::timestamp::now(), + ) + .await?; + + Ok(()) +} diff --git a/packages/edge/services/pegboard/src/workflows/actor2/setup.rs b/packages/edge/services/pegboard/src/workflows/actor2/setup.rs new file mode 100644 index 0000000000..47cbd02b42 --- /dev/null +++ b/packages/edge/services/pegboard/src/workflows/actor2/setup.rs @@ -0,0 +1,728 @@ +use build::types::{BuildAllocationType, BuildCompression, BuildKind}; +use chirp_workflow::prelude::*; +use cluster::types::BuildDeliveryMethod; +use fdb_util::FormalKey; +use foundationdb as fdb; +use sqlx::Acquire; +use util::serde::AsHashableExt; + +use super::{Input, Port}; +use crate::{ + keys, protocol, + types::{ActorLifecycle, ActorResources, GameGuardProtocol, NetworkMode, Routing}, +}; + +#[derive(Debug, Clone, Serialize, Deserialize, Hash)] +pub struct ValidateInput { + pub env_id: Uuid, + pub tags: util::serde::HashableMap, + pub resources: ActorResources, + pub image_id: Uuid, + pub root_user_enabled: bool, + pub args: Vec, + pub network_mode: NetworkMode, + pub environment: util::serde::HashableMap, + pub network_ports: util::serde::HashableMap, +} + +// TODO: Redo once a solid global error solution is established so we dont have to have validation all in one +// place. +#[activity(Validate)] +pub async fn validate(ctx: &ActivityCtx, input: &ValidateInput) -> GlobalResult> { + let dc_id = ctx.config().server()?.rivet.edge()?.datacenter_id; + + let (has_tier, upload_res, game_config_res) = tokio::try_join!( + async { + let tier_res = ctx + .op(tier::ops::list::Input { + datacenter_ids: vec![dc_id], + pegboard: true, + }) + .await?; + let tier_dc = unwrap!(tier_res.datacenters.first()); + + // Find any tier that has more CPU and memory than the requested resources + GlobalResult::Ok(tier_dc.tiers.iter().any(|t| { + t.cpu_millicores >= input.resources.cpu_millicores + && t.memory >= input.resources.memory_mib + })) + }, + async { + let builds_res = ctx + .op(build::ops::get::Input { + build_ids: vec![input.image_id], + }) + .await?; + + let Some(build) = builds_res.builds.into_iter().next() else { + return Ok(None); + }; + + let uploads_res = op!([ctx] upload_get { + upload_ids: vec![build.upload_id.into()], + }) + .await?; + + Ok(Some(( + build, + unwrap!(uploads_res.uploads.first()).complete_ts.is_some(), + ))) + }, + async { + let games_res = op!([ctx] game_resolve_namespace_id { + namespace_ids: vec![input.env_id.into()], + }) + .await?; + + let Some(game) = games_res.games.first() else { + return Ok(None); + }; + + let game_config_res = ctx + .op(crate::ops::game_config::get::Input { + game_ids: vec![unwrap!(game.game_id).into()], + }) + .await?; + + Ok(Some(unwrap!(game_config_res.game_configs.first()).clone())) + } + )?; + + if !has_tier { + return Ok(Some("Too many resources allocated.".into())); + } + + // TODO: Validate build belongs to env/game + let Some((build, upload_complete)) = upload_res else { + return Ok(Some("Build not found.".into())); + }; + + if !upload_complete { + return Ok(Some("Build upload not complete.".into())); + } + + let Some(game_config) = game_config_res else { + return Ok(Some("Environment not found.".into())); + }; + + if matches!(input.network_mode, NetworkMode::Host) && !game_config.host_networking_enabled { + return Ok(Some("Host networking is not enabled for this game.".into())); + } + + if input.root_user_enabled && !game_config.root_user_enabled { + return Ok(Some( + "Docker root user is not enabled for this game.".into(), + )); + } + + if input.tags.len() > 8 { + return Ok(Some("Too many tags 
(max 8).".into())); + } + + for (k, v) in &input.tags { + if k.is_empty() { + return Ok(Some("tags[]: Tag label cannot be empty.".into())); + } + if k.len() > 32 { + return Ok(Some(format!( + "tags[{:?}]: Tag label too large (max 32 bytes).", + util::safe_slice(k, 0, 32), + ))); + } + if v.is_empty() { + return Ok(Some(format!("tags[{k:?}]: Tag value cannot be empty.",))); + } + if v.len() > 1024 { + return Ok(Some(format!( + "tags[{k:?}]: Tag value too large (max 1024 bytes)." + ))); + } + } + + if input.args.len() > 64 { + return Ok(Some("Too many arguments (max 64).".into())); + } + + for (i, arg) in input.args.iter().enumerate() { + if arg.len() > 256 { + return Ok(Some(format!( + "runtime.args[{i}]: Argument too large (max 256 bytes)." + ))); + } + } + + if input.environment.len() > 64 { + return Ok(Some("Too many environment variables (max 64).".into())); + } + + for (k, v) in &input.environment { + if k.len() > 256 { + return Ok(Some(format!( + "runtime.environment[{:?}]: Key too large (max 256 bytes).", + util::safe_slice(k, 0, 256), + ))); + } + if v.len() > 1024 { + return Ok(Some(format!( + "runtime.environment[{k:?}]: Value too large (max 1024 bytes)." + ))); + } + } + + if input.network_ports.len() > 8 { + return Ok(Some("Too many ports (max 8).".into())); + } + + for (name, port) in &input.network_ports { + if name.len() > 16 { + return Ok(Some(format!( + "runtime.ports[{:?}]: Port name too large (max 16 bytes).", + util::safe_slice(name, 0, 16), + ))); + } + + match input.network_mode { + NetworkMode::Bridge => { + // NOTE: Temporary validation until we implement bridge networking for isolates + if let BuildKind::JavaScript = build.kind { + if port.internal_port.is_some() { + return Ok(Some(format!( + "runtime.ports[{name:?}].internal_port: Must be null when `network.mode` = \"bridge\" and using a JS build.", + ))); + } + } + } + NetworkMode::Host => { + if port.internal_port.is_some() { + return Ok(Some(format!( + "runtime.ports[{name:?}].internal_port: Must be null when `network.mode` = \"host\".", + ))); + } + } + } + } + + Ok(None) +} + +#[derive(Debug, Clone, Serialize, Deserialize, Hash)] +pub struct DisableTlsPortsInput { + pub network_ports: util::serde::HashableMap, +} + +/// If TLS is not enabled in the cluster, we downgrade all protocols to the non-TLS equivalents. +/// This allows developers to develop locally with the same code they would use in production. +#[activity(DisableTlsPorts)] +pub async fn disable_tls_ports( + ctx: &ActivityCtx, + input: &DisableTlsPortsInput, +) -> GlobalResult> { + if ctx.config().server()?.rivet.guard.tls_enabled() { + // Do nothing + Ok(input.network_ports.clone()) + } else { + // Downgrade all TLS protocols to non-TLS protocols + let network_ports = input + .network_ports + .clone() + .into_iter() + .map(|(k, p)| { + ( + k, + Port { + internal_port: p.internal_port, + routing: match p.routing { + Routing::GameGuard { protocol } => Routing::GameGuard { + protocol: match protocol { + GameGuardProtocol::Https => GameGuardProtocol::Http, + GameGuardProtocol::TcpTls => GameGuardProtocol::Tcp, + x @ (GameGuardProtocol::Http + | GameGuardProtocol::Tcp + | GameGuardProtocol::Udp) => x, + }, + }, + x @ Routing::Host { .. 
} => x, + }, + }, + ) + }) + .collect(); + + Ok(network_ports) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, Hash)] +struct InsertDbInput { + actor_id: Uuid, + env_id: Uuid, + tags: util::serde::HashableMap, + resources: ActorResources, + lifecycle: ActorLifecycle, + image_id: Uuid, + args: Vec, + network_mode: NetworkMode, + environment: util::serde::HashableMap, + network_ports: util::serde::HashableMap, +} + +#[activity(InsertDb)] +async fn insert_db(ctx: &ActivityCtx, input: &InsertDbInput) -> GlobalResult { + let pool = ctx.sqlite().await?; + let mut conn = pool.conn().await?; + let mut tx = conn.begin().await?; + let create_ts = ctx.ts(); + + sql_execute!( + [ctx, @tx &mut tx] + " + INSERT INTO state ( + env_id, + tags, + resources_cpu_millicores, + resources_memory_mib, + lifecycle_kill_timeout_ms, + lifecycle_durable, + create_ts, + image_id, + args, + network_mode, + environment + ) + VALUES (?, jsonb(?), ?, ?, ?, ?, ?, ?, jsonb(?), ?, jsonb(?)) + ", + input.env_id, + serde_json::to_string(&input.tags)?, + input.resources.cpu_millicores as i32, + input.resources.memory_mib as i32, + input.lifecycle.kill_timeout_ms, + input.lifecycle.durable, + create_ts, + input.image_id, + serde_json::to_string(&input.args)?, + input.network_mode as i32, + serde_json::to_string(&input.environment)?, + ) + .await?; + + // Count up ports per protocol + let mut port_counts = Vec::new(); + for (_, port) in &input.network_ports { + match port.routing { + Routing::GameGuard { + protocol: + protocol @ (GameGuardProtocol::Tcp + | GameGuardProtocol::TcpTls + | GameGuardProtocol::Udp), + } => { + if let Some((_, count)) = port_counts.iter_mut().find(|(p, _)| &protocol == p) { + *count += 1; + } else { + port_counts.push((protocol, 1)); + } + } + _ => {} + } + } + + // TODO: Move this from an op to an activity, and move the sql queries after to their own activity + // Choose which port to assign for a job's ingress port. + // This is required because TCP and UDP do not have a `Host` header and thus cannot be re-routed by hostname. + // + // If not provided by `ProxiedPort`, then: + // - HTTP: 80 + // - HTTPS: 443 + // - TCP/TLS: random + // - UDP: random + let ingress_ports_res = ctx + .op(crate::ops::actor::allocate_ingress_ports::Input { + actor_id: input.actor_id, + ports: port_counts, + }) + .await?; + let mut ingress_ports = ingress_ports_res + .ports + .into_iter() + .map(|(protocol, ports)| (protocol, ports.into_iter())) + .collect::>(); + + let gg_config = &ctx.config().server()?.rivet.guard; + for (name, port) in input.network_ports.iter() { + match port.routing { + Routing::GameGuard { protocol } => { + sql_execute!( + [ctx, @tx &mut tx] + " + INSERT INTO ports_ingress ( + port_name, + port_number, + protocol, + ingress_port_number + ) + VALUES (?, ?, ?, ?) + ", + name, + port.internal_port.map(|x| x as i32), + protocol as i32, + match protocol { + GameGuardProtocol::Http => gg_config.http_port(), + GameGuardProtocol::Https => gg_config.https_port(), + GameGuardProtocol::Tcp | GameGuardProtocol::TcpTls | GameGuardProtocol::Udp => { + let (_, ports_iter) = unwrap!( + ingress_ports.iter_mut().find(|(p, _)| &protocol == p) + ); + unwrap!(ports_iter.next(), "missing ingress port") + }, + } as i32, + ) + .await?; + } + Routing::Host { protocol } => { + sql_execute!( + [ctx, @tx &mut tx] + " + INSERT INTO ports_host ( + port_name, + port_number, + protocol + ) + VALUES (?, ?, ?) 
+ ", + name, + port.internal_port.map(|x| x as i32), + protocol as i32, + ) + .await?; + } + }; + } + + tx.commit().await?; + + Ok(create_ts) +} + +#[derive(Debug, Clone, Serialize, Deserialize, Hash)] +struct InsertFdbInput { + actor_id: Uuid, + env_id: Uuid, + tags: util::serde::HashableMap, + create_ts: i64, +} + +#[activity(InsertFdb)] +async fn insert_fdb(ctx: &ActivityCtx, input: &InsertFdbInput) -> GlobalResult<()> { + ctx.fdb() + .await? + .run(|tx, _mc| async move { + let create_ts_key = keys::actor::CreateTsKey::new(input.actor_id); + tx.set( + &keys::subspace().pack(&create_ts_key), + &create_ts_key + .serialize(input.create_ts) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + let workflow_id_key = keys::actor::WorkflowIdKey::new(input.actor_id); + tx.set( + &keys::subspace().pack(&workflow_id_key), + &workflow_id_key + .serialize(ctx.workflow_id()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Add env index key + let env_actor_key = + keys::env::ActorKey::new(input.env_id, input.create_ts, input.actor_id); + let data = keys::env::ActorKeyData { + is_destroyed: false, + tags: input.tags.clone().into_iter().collect(), + }; + tx.set( + &keys::subspace().pack(&env_actor_key), + &env_actor_key + .serialize(data) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + Ok(()) + }) + .custom_instrument(tracing::info_span!("actor_insert_tx")) + .await?; + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct GetMetaInput { + env_id: Uuid, + image_id: Uuid, +} + +#[derive(Clone, Debug, Serialize, Deserialize, Hash)] +pub struct GetMetaOutput { + pub project_id: Uuid, + pub project_slug: String, + pub env_slug: String, + pub build_upload_id: Uuid, + pub build_file_name: String, + pub build_kind: BuildKind, + pub build_compression: BuildCompression, + pub build_allocation_type: BuildAllocationType, + pub build_allocation_total_slots: u64, + pub dc_name_id: String, + pub dc_display_name: String, + pub dc_build_delivery_method: BuildDeliveryMethod, +} + +#[activity(GetMeta)] +async fn get_meta(ctx: &ActivityCtx, input: &GetMetaInput) -> GlobalResult { + let dc_id = ctx.config().server()?.rivet.edge()?.datacenter_id; + + let (env_res, build_res, dc_res) = tokio::try_join!( + op!([ctx] game_namespace_get { + namespace_ids: vec![input.env_id.into()], + }), + ctx.op(build::ops::get::Input { + build_ids: vec![input.image_id], + }), + ctx.op(cluster::ops::datacenter::get::Input { + datacenter_ids: vec![dc_id], + }) + )?; + let env = unwrap_with!(env_res.namespaces.first(), ENVIRONMENT_NOT_FOUND); + let build = unwrap_with!(build_res.builds.first(), BUILD_NOT_FOUND); + let dc = unwrap!(dc_res.datacenters.first()); + + // Lookup project + let project_id = unwrap!(env.game_id).as_uuid(); + let projects_res = op!([ctx] game_get { + game_ids: vec![project_id.into()], + }) + .await?; + let project = unwrap!(projects_res.games.first()); + + Ok(GetMetaOutput { + project_id, + project_slug: project.name_id.clone(), + env_slug: env.name_id.clone(), + build_upload_id: build.upload_id, + build_file_name: build::utils::file_name(build.kind, build.compression), + build_kind: build.kind, + build_compression: build.compression, + build_allocation_type: build.allocation_type, + build_allocation_total_slots: build.allocation_total_slots, + dc_name_id: dc.name_id.clone(), + dc_display_name: dc.display_name.clone(), + dc_build_delivery_method: dc.build_delivery_method, + }) +} + +pub enum SetupCtx { + Init { + network_ports: 
+	},
+	Reschedule {
+		image_id: Uuid,
+	},
+}
+
+#[derive(Clone)]
+pub struct ActorSetupCtx {
+	pub image_id: Uuid,
+	pub meta: GetMetaOutput,
+	pub resources: protocol::Resources,
+	pub artifact_url_stub: String,
+	pub fallback_artifact_url: Option<String>,
+}
+
+pub async fn setup(
+	ctx: &mut WorkflowCtx,
+	input: &Input,
+	setup: SetupCtx,
+) -> GlobalResult<ActorSetupCtx> {
+	let image_id = match setup {
+		SetupCtx::Init { network_ports } => {
+			let tags = input.tags.as_hashable();
+			let create_ts = ctx
+				.activity(InsertDbInput {
+					actor_id: input.actor_id,
+					env_id: input.env_id,
+					tags: tags.clone(),
+					resources: input.resources.clone(),
+					lifecycle: input.lifecycle.clone(),
+					image_id: input.image_id,
+					args: input.args.clone(),
+					network_mode: input.network_mode,
+					environment: input.environment.as_hashable(),
+					network_ports,
+				})
+				.await?;
+
+			ctx.activity(InsertFdbInput {
+				actor_id: input.actor_id,
+				env_id: input.env_id,
+				tags,
+				create_ts,
+			})
+			.await?;
+
+			input.image_id
+		}
+		SetupCtx::Reschedule { image_id } => image_id,
+	};
+
+	let meta = ctx
+		.activity(GetMetaInput {
+			env_id: input.env_id,
+			image_id,
+		})
+		.await?;
+
+	let (resources, artifacts_res) = ctx
+		.join((
+			activity(SelectResourcesInput {
+				resources: input.resources.clone(),
+			}),
+			activity(ResolveArtifactsInput {
+				build_upload_id: meta.build_upload_id,
+				build_file_name: meta.build_file_name.clone(),
+				dc_build_delivery_method: meta.dc_build_delivery_method,
+			}),
+		))
+		.await?;
+
+	Ok(ActorSetupCtx {
+		image_id,
+		meta,
+		resources,
+		artifact_url_stub: artifacts_res.artifact_url_stub,
+		fallback_artifact_url: artifacts_res.fallback_artifact_url,
+	})
+}
+
+#[derive(Debug, Serialize, Deserialize, Hash)]
+struct SelectResourcesInput {
+	resources: ActorResources,
+}
+
+#[activity(SelectResources)]
+async fn select_resources(
+	ctx: &ActivityCtx,
+	input: &SelectResourcesInput,
+) -> GlobalResult<protocol::Resources> {
+	let dc_id = ctx.config().server()?.rivet.edge()?.datacenter_id;
+
+	let tier_res = ctx
+		.op(tier::ops::list::Input {
+			datacenter_ids: vec![dc_id],
+			pegboard: true,
+		})
+		.await?;
+	let tier_dc = unwrap!(tier_res.datacenters.first());
+	let mut tiers = tier_dc.tiers.iter().collect::<Vec<_>>();
+
+	// Sort the tiers by cpu
+	tiers.sort_by(|a, b| a.cpu.cmp(&b.cpu));
+
+	// Find the first tier that has more CPU and memory than the requested
+	// resources
+	let tier = unwrap!(
+		tiers.iter().find(|t| {
+			t.cpu_millicores >= input.resources.cpu_millicores
+				&& t.memory >= input.resources.memory_mib
+		}),
+		"no suitable tier found"
+	);
+
+	// runc-compatible resources
+	let cpu = tier.rivet_cores_numerator as u64 * 1_000 / tier.rivet_cores_denominator as u64; // Millicore (1/1000 of a core)
+	let memory = tier.memory as u64 * (1024 * 1024);
+	let memory_max = tier.memory_max as u64 * (1024 * 1024);
+
+	let pool = ctx.sqlite().await?;
+
+	// Write to db
+	sql_execute!(
+		[ctx, pool]
+		"
+		UPDATE state
+		SET
+			selected_resources_cpu_millicores = ?,
+			selected_resources_memory_mib = ?
+ ", + i64::try_from(cpu)?, + i64::try_from(tier.memory)?, + ) + .await?; + + Ok(protocol::Resources { + cpu, + memory, + memory_max, + disk: tier.disk, + }) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct ResolveArtifactsInput { + build_upload_id: Uuid, + build_file_name: String, + dc_build_delivery_method: BuildDeliveryMethod, +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct ResolveArtifactsOutput { + artifact_url_stub: String, + fallback_artifact_url: Option, +} + +#[activity(ResolveArtifacts)] +async fn resolve_artifacts( + ctx: &ActivityCtx, + input: &ResolveArtifactsInput, +) -> GlobalResult { + let fallback_artifact_url = + if let BuildDeliveryMethod::S3Direct = input.dc_build_delivery_method { + tracing::debug!("using s3 direct delivery"); + + // Build client + let s3_client = s3_util::Client::with_bucket_and_endpoint( + ctx.config(), + "bucket-build", + s3_util::EndpointKind::EdgeInternal, + ) + .await?; + + let presigned_req = s3_client + .get_object() + .bucket(s3_client.bucket()) + .key(format!( + "{upload_id}/{file_name}", + upload_id = input.build_upload_id, + file_name = input.build_file_name, + )) + .presigned( + s3_util::aws_sdk_s3::presigning::PresigningConfig::builder() + .expires_in(std::time::Duration::from_secs(15 * 60)) + .build()?, + ) + .await?; + + let addr_str = presigned_req.uri().to_string(); + tracing::debug!(addr = %addr_str, "resolved artifact s3 presigned request"); + + Some(addr_str) + } else { + None + }; + + Ok(ResolveArtifactsOutput { + artifact_url_stub: crate::util::image_artifact_url_stub( + ctx.config(), + input.build_upload_id, + &input.build_file_name, + )?, + fallback_artifact_url, + }) +} diff --git a/packages/edge/services/pegboard/src/workflows/client/mod.rs b/packages/edge/services/pegboard/src/workflows/client/mod.rs index 0f9fa637a8..5899b8ac11 100644 --- a/packages/edge/services/pegboard/src/workflows/client/mod.rs +++ b/packages/edge/services/pegboard/src/workflows/client/mod.rs @@ -772,6 +772,9 @@ pub async fn handle_commands( } } } + protocol::Command::SignalRunner { .. } => { + // No-op in this workflow + } } } diff --git a/packages/edge/services/pegboard/src/workflows/mod.rs b/packages/edge/services/pegboard/src/workflows/mod.rs index 69eb77c1dc..9e896b1c34 100644 --- a/packages/edge/services/pegboard/src/workflows/mod.rs +++ b/packages/edge/services/pegboard/src/workflows/mod.rs @@ -1,2 +1,3 @@ pub mod actor; +pub mod actor2; pub mod client;