Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
6303576
building the template (almost) works, with minimal changes!
djeebus Sep 29, 2025
4251937
nearly there
djeebus Sep 29, 2025
c10d42f
so close!
djeebus Sep 29, 2025
2020c5e
successfully built!
djeebus Sep 30, 2025
53741ef
a little more clean up
djeebus Sep 30, 2025
f446d92
clean up error message
djeebus Sep 30, 2025
66a7947
consistent tests
djeebus Sep 30, 2025
f0be207
gather env vars just-in-time rather than on init()
djeebus Sep 30, 2025
964c811
Merge branch 'just-in-time-env-vars' into test-harness
djeebus Sep 30, 2025
d345ae3
back to benchmark
djeebus Sep 30, 2025
6ea705c
progress
djeebus Sep 30, 2025
8edaf97
only build templates when not yet built
djeebus Oct 1, 2025
394528a
Merge branch 'main' into test-harness
djeebus Oct 1, 2025
82074fd
fix some issues plus some clean up
djeebus Oct 1, 2025
23ebdcc
add some instructions
djeebus Oct 1, 2025
37b9143
enable huge pages, remove useless file
djeebus Oct 2, 2025
192d4c3
revert some changes
djeebus Oct 2, 2025
82ce513
stop using b.Cleanup
djeebus Oct 2, 2025
86dd6bb
add tracing when useful
djeebus Oct 2, 2025
8d38265
Merge branch 'main' into test-harness
djeebus Oct 2, 2025
a870eb6
linting
djeebus Oct 2, 2025
b93f05c
fix tracer
djeebus Oct 2, 2025
b953305
Merge branch 'main' into test-harness
djeebus Oct 2, 2025
58927b2
Merge branch 'main' into test-harness
djeebus Oct 7, 2025
ecdce77
Merge branch 'main' into test-harness
djeebus Oct 7, 2025
ef48758
fix compilation issue
djeebus Oct 8, 2025
eb619e1
clean up
djeebus Oct 8, 2025
1b23122
check the status code. thanks cursor!
djeebus Oct 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
379 changes: 379 additions & 0 deletions packages/orchestrator/benchmark_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,379 @@
// run with something like:
//
// sudo `which go` test -benchtime=15s -bench=. -v
//
// host setup, to be done before running:
//
// sudo modprobe nbd
// echo 1024 | sudo tee /proc/sys/vm/nr_hugepages
package main

import (
"net/http"
"net/url"
"os"
"path/filepath"
"testing"
"time"

"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/metric/noop"
"go.uber.org/zap"

"github.com/e2b-dev/infra/packages/orchestrator/internal/proxy"
"github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox"
blockmetrics "github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox/block/metrics"
"github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox/nbd"
"github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox/network"
"github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox/template"
"github.com/e2b-dev/infra/packages/orchestrator/internal/template/build"
"github.com/e2b-dev/infra/packages/orchestrator/internal/template/build/config"
"github.com/e2b-dev/infra/packages/orchestrator/internal/template/build/metrics"
artifactsregistry "github.com/e2b-dev/infra/packages/shared/pkg/artifacts-registry"
"github.com/e2b-dev/infra/packages/shared/pkg/dockerhub"
featureflags "github.com/e2b-dev/infra/packages/shared/pkg/feature-flags"
"github.com/e2b-dev/infra/packages/shared/pkg/limit"
sbxlogger "github.com/e2b-dev/infra/packages/shared/pkg/logger/sandbox"
"github.com/e2b-dev/infra/packages/shared/pkg/smap"
"github.com/e2b-dev/infra/packages/shared/pkg/storage"
"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
)

var tracer = otel.Tracer("github.com/e2b-dev/infra/packages/orchestrator")

// BenchmarkBaseImageLaunch benchmarks sandbox lifecycle operations against a
// template built from the base image.
//
// The benchmark is skipped unless it runs as root. Setup downloads the kernel
// if it is not cached, wires up the orchestrator dependencies (network pool,
// NBD device pool, template cache, sandbox proxy, template builder), builds
// the template when its rootfs is not yet present in the persistent cache
// directory, and then calls testOneItem once per b.Loop iteration. The
// testType constant selects which lifecycle steps each iteration performs.
func BenchmarkBaseImageLaunch(b *testing.B) {
	if os.Geteuid() != 0 {
		b.Skip("skipping benchmark because not running as root")
	}

	// test configuration
	const (
		testType            = onlyStart
		baseImage           = "e2bdev/base"
		kernelVersion       = "vmlinux-6.1.102"
		fcVersion           = "v1.10.1_1fcdaec08"
		templateID          = "fcb33d09-3141-42c4-8d3b-c2df411681db"
		buildID             = "ba6aae36-74f7-487a-b6f7-74fd7c94e479"
		useHugePages        = false
		allowInternetAccess = true
	)

	// cache paths, to speed up test runs. these paths aren't wiped between tests
	persistenceDir := filepath.Join(os.TempDir(), "e2b-orchestrator-benchmark")
	kernelsDir := filepath.Join(persistenceDir, "kernels")
	sandboxDir := filepath.Join(persistenceDir, "sandbox")
	err := os.MkdirAll(kernelsDir, 0o755)
	require.NoError(b, err)

	// ephemeral data
	tempDir := b.TempDir()
	clientID := uuid.NewString()

	abs := func(s string) string {
		return utils.Must(filepath.Abs(s))
	}

	// tracing is only configured when an OTLP endpoint is provided
	endpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
	if endpoint != "" {
		spanExporter, err := telemetry.NewSpanExporter(b.Context(),
			otlptracegrpc.WithEndpoint(endpoint),
		)
		// check the error before registering the shutdown: a failed
		// constructor may hand back a nil exporter, and the deferred call
		// would then dereference it while the benchmark is aborting
		require.NoError(b, err)
		defer func() {
			err := spanExporter.Shutdown(b.Context())
			assert.NoError(b, err)
		}()
		resource, err := telemetry.GetResource(b.Context(), "node-id", "BenchmarkBaseImageLaunch", "service-commit", "service-version", "service-instance-id")
		require.NoError(b, err)
		tracerProvider := telemetry.NewTracerProvider(spanExporter, resource)
		otel.SetTracerProvider(tracerProvider)
	}

	linuxKernelURL, err := url.JoinPath("https://storage.googleapis.com/e2b-prod-public-builds/kernels/", kernelVersion, "vmlinux.bin")
	require.NoError(b, err)
	linuxKernelFilename := filepath.Join(kernelsDir, kernelVersion, "vmlinux.bin")

	downloadKernel(b, linuxKernelFilename, linuxKernelURL)

	// hacks, these should go away
	b.Setenv("ARTIFACTS_REGISTRY_PROVIDER", "Local")
	b.Setenv("USE_LOCAL_NAMESPACE_STORAGE", "true")
	b.Setenv("STORAGE_PROVIDER", "Local")
	b.Setenv("ORCHESTRATOR_BASE_PATH", tempDir)
	b.Setenv("HOST_ENVD_PATH", abs(filepath.Join("..", "envd", "bin", "envd")))
	b.Setenv("FIRECRACKER_VERSIONS_DIR", abs(filepath.Join("..", "fc-versions", "builds")))
	b.Setenv("HOST_KERNELS_DIR", abs(kernelsDir))
	b.Setenv("SANDBOX_DIR", abs(sandboxDir))
	b.Setenv("SNAPSHOT_CACHE_DIR", abs(filepath.Join(tempDir, "snapshot-cache")))
	b.Setenv("LOCAL_TEMPLATE_STORAGE_BASE_PATH", abs(filepath.Join(persistenceDir, "templates")))

	// prep directories
	for _, subdir := range []string{"build", "build-templates" /*"fc-vm",*/, "sandbox", "snapshot-cache", "template"} {
		fullDirName := filepath.Join(tempDir, subdir)
		err := os.MkdirAll(fullDirName, 0o755)
		require.NoError(b, err)
	}

	logger, err := zap.NewDevelopment()
	require.NoError(b, err)

	sbxlogger.SetSandboxLoggerInternal(logger)
	// sbxlogger.SetSandboxLoggerExternal(logger)

	networkPool, err := network.NewPool(
		b.Context(), noop.MeterProvider{}, 8, 8, clientID,
	)
	require.NoError(b, err)
	defer func() {
		err := networkPool.Close(b.Context())
		assert.NoError(b, err)
	}()

	devicePool, err := nbd.NewDevicePool(b.Context(), noop.MeterProvider{})
	require.NoError(b, err, "do you have the nbd kernel module installed?")
	defer func() {
		err := devicePool.Close(b.Context())
		assert.NoError(b, err)
	}()

	featureFlags, err := featureflags.NewClient()
	require.NoError(b, err)
	defer func() {
		err := featureFlags.Close(b.Context())
		assert.NoError(b, err)
	}()

	limiter, err := limit.New(b.Context(), featureFlags)
	require.NoError(b, err)

	persistence, err := storage.GetTemplateStorageProvider(b.Context(), limiter)
	require.NoError(b, err)

	blockMetrics, err := blockmetrics.NewMetrics(&noop.MeterProvider{})
	require.NoError(b, err)

	templateCache, err := template.NewCache(b.Context(), featureFlags, persistence, blockMetrics)
	require.NoError(b, err)

	sandboxFactory := sandbox.NewFactory(networkPool, devicePool, featureFlags, true)

	dockerhubRepository, err := dockerhub.GetRemoteRepository(b.Context())
	require.NoError(b, err)
	defer func() {
		err := dockerhubRepository.Close()
		assert.NoError(b, err)
	}()

	accessToken := "access-token"
	sandboxConfig := sandbox.Config{
		BaseTemplateID:      templateID,
		Vcpu:                2,
		RamMB:               512,
		TotalDiskSizeMB:     2 * 1024,
		HugePages:           useHugePages,
		AllowInternetAccess: ptr(allowInternetAccess),
		Envd: sandbox.EnvdMetadata{
			Vars:        map[string]string{"HELLO": "WORLD"},
			AccessToken: &accessToken,
			Version:     "1.2.3",
		},
	}

	runtime := sandbox.RuntimeMetadata{
		TemplateID:  templateID,
		SandboxID:   "sandbox-id",
		ExecutionID: "execution-id",
		TeamID:      "team-id",
	}

	artifactRegistry, err := artifactsregistry.GetArtifactsRegistryProvider(b.Context())
	require.NoError(b, err)

	persistenceTemplate, err := storage.GetTemplateStorageProvider(b.Context(), nil)
	require.NoError(b, err)

	persistenceBuild, err := storage.GetBuildCacheStorageProvider(b.Context(), nil)
	require.NoError(b, err)

	var proxyPort uint = 5007

	sandboxes := smap.New[*sandbox.Sandbox]()

	sandboxProxy, err := proxy.NewSandboxProxy(noop.MeterProvider{}, proxyPort, sandboxes)
	require.NoError(b, err)
	// closed when the serving goroutine has returned
	proxyStopped := make(chan struct{})
	go func() {
		defer close(proxyStopped)
		err := sandboxProxy.Start(b.Context())
		// argument order is (t, err, target)
		assert.ErrorIs(b, err, http.ErrServerClosed)
	}()
	defer func() {
		err := sandboxProxy.Close(b.Context())
		assert.NoError(b, err)

		// wait (bounded) for the serving goroutine so that its assertion is
		// reported while the benchmark is still running
		select {
		case <-proxyStopped:
		case <-time.After(5 * time.Second):
			b.Error("sandbox proxy did not stop after Close")
		}
	}()

	buildMetrics, err := metrics.NewBuildMetrics(noop.MeterProvider{})
	require.NoError(b, err)

	builder := build.NewBuilder(
		logger,
		sandboxFactory,
		persistenceTemplate,
		persistenceBuild,
		artifactRegistry,
		dockerhubRepository,
		sandboxProxy,
		sandboxes,
		templateCache,
		buildMetrics,
	)

	// the template lives in the persistent cache dir, so it is only built
	// when its rootfs is missing
	buildPath := filepath.Join(os.Getenv("LOCAL_TEMPLATE_STORAGE_BASE_PATH"), buildID, "rootfs.ext4")
	if _, err := os.Stat(buildPath); os.IsNotExist(err) {
		// build template
		force := true
		templateConfig := config.TemplateConfig{
			TemplateID: templateID,
			FromImage:  baseImage,
			Force:      &force,
			VCpuCount:  sandboxConfig.Vcpu,
			MemoryMB:   sandboxConfig.RamMB,
			StartCmd:   "echo 'start cmd debug' && sleep 10 && echo 'done starting command debug'",
			DiskSizeMB: sandboxConfig.TotalDiskSizeMB,
			HugePages:  sandboxConfig.HugePages,
		}

		metadata := storage.TemplateFiles{
			BuildID:            buildID,
			KernelVersion:      kernelVersion,
			FirecrackerVersion: fcVersion,
		}
		_, err = builder.Build(b.Context(), metadata, templateConfig, logger.Core())
		require.NoError(b, err)
	}

	// retrieve template
	tmpl, err := templateCache.GetTemplate(
		b.Context(),
		buildID,
		kernelVersion,
		fcVersion,
		false,
		false,
	)
	require.NoError(b, err)

	tc := testContainer{
		sandboxFactory: sandboxFactory,
		testType:       testType,
		tmpl:           tmpl,
		sandboxConfig:  sandboxConfig,
		runtime:        runtime,
	}

	for b.Loop() {
		tc.testOneItem(b, buildID, kernelVersion, fcVersion)
	}
}

// ptr returns a pointer to a fresh copy of v. It is handy for filling
// optional (pointer-typed) fields from constants and literals.
func ptr[T any](v T) *T {
	out := new(T)
	*out = v

	return out
}

// testCycle names the sandbox lifecycle steps a benchmark iteration runs.
type testCycle string

// Supported benchmark cycles.
const (
	// onlyStart starts a sandbox.
	onlyStart = testCycle("only-start")
	// startAndPause starts a sandbox and then pauses it.
	startAndPause = testCycle("start-and-pause")
	// startPauseResume starts a sandbox, pauses it, and resumes again.
	startPauseResume = testCycle("start-pause-resume")
)

// testContainer carries everything a single benchmark iteration needs, so the
// per-iteration method only has to receive the template identifiers.
type testContainer struct {
	// which lifecycle steps each iteration performs
	testType testCycle

	// dependencies used to launch sandboxes
	sandboxFactory *sandbox.Factory
	tmpl           template.Template

	// arguments forwarded to every ResumeSandbox call
	sandboxConfig sandbox.Config
	runtime       sandbox.RuntimeMetadata
}

func (tc *testContainer) testOneItem(b *testing.B, buildID, kernelVersion, fcVersion string) {
b.Helper()

ctx, span := tracer.Start(b.Context(), "testOneItem")
defer span.End()

sbx, err := tc.sandboxFactory.ResumeSandbox(
ctx,
tc.tmpl,
tc.sandboxConfig,
tc.runtime,
uuid.NewString(),
time.Now(),
time.Now().Add(time.Second*15),
nil,
)
require.NoError(b, err)

if tc.testType == onlyStart {
b.StopTimer()
err = sbx.Close(ctx)
require.NoError(b, err)
b.StartTimer()
return
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Benchmark Timer Resets During Iteration

The benchmark timer restarts right before testOneItem returns or finishes an iteration. This causes the timer to run during the next iteration's setup phase, leading to inaccurate benchmark results.

Fix in Cursor Fix in Web

}

meta, err := sbx.Template.Metadata()
require.NoError(b, err)

templateMetadata := meta.SameVersionTemplate(storage.TemplateFiles{
BuildID: buildID,
KernelVersion: kernelVersion,
FirecrackerVersion: fcVersion,
})
snap, err := sbx.Pause(ctx, templateMetadata)
require.NoError(b, err)
require.NotNil(b, snap)

if tc.testType == startAndPause {
b.StopTimer()
err = sbx.Close(ctx)
require.NoError(b, err)
b.StartTimer()
}

// resume sandbox
sbx, err = tc.sandboxFactory.ResumeSandbox(ctx, tc.tmpl, tc.sandboxConfig, tc.runtime, uuid.NewString(), time.Now(), time.Now().Add(time.Second*15), nil)
require.NoError(b, err)

// close sandbox
err = sbx.Close(ctx)
require.NoError(b, err)
}

// downloadKernel makes sure a kernel image exists at filename, fetching it
// from url when the file is missing. Any failure aborts the benchmark.
//
// The download is written to a temporary file next to the destination and
// renamed into place only once it is complete, so an interrupted run cannot
// leave a truncated file that later runs would accept as a cached kernel.
func downloadKernel(b *testing.B, filename, url string) {
	b.Helper()

	dirname := filepath.Dir(filename)
	err := os.MkdirAll(dirname, 0o755)
	require.NoError(b, err)

	// kernel already exists
	if _, err := os.Stat(filename); err == nil {
		return
	}

	client := &http.Client{}
	req, err := http.NewRequestWithContext(b.Context(), http.MethodGet, url, nil)
	require.NoError(b, err)
	response, err := client.Do(req)
	require.NoError(b, err)
	// register the close before the status assertion, which can abort the
	// benchmark and would otherwise leak the body
	defer response.Body.Close()
	require.Equal(b, http.StatusOK, response.StatusCode)

	// same directory as the destination so the final rename stays on one
	// filesystem
	tmpFile, err := os.CreateTemp(dirname, filepath.Base(filename)+".*.tmp")
	require.NoError(b, err)
	tmpName := tmpFile.Name()
	defer func() {
		// best-effort cleanup for the failure paths; after a successful
		// rename both calls fail harmlessly
		_ = tmpFile.Close()
		_ = os.Remove(tmpName)
	}()

	_, err = tmpFile.ReadFrom(response.Body)
	require.NoError(b, err)

	// a failed close can mean the data never reached the disk
	err = tmpFile.Close()
	require.NoError(b, err)

	// CreateTemp creates the file with mode 0600; restore the mode the kernel
	// file was previously created with
	err = os.Chmod(tmpName, 0o644)
	require.NoError(b, err)

	err = os.Rename(tmpName, filename)
	require.NoError(b, err)
}
2 changes: 1 addition & 1 deletion packages/orchestrator/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ require (
go.opentelemetry.io/otel v1.38.0
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0
go.opentelemetry.io/otel/metric v1.38.0
go.opentelemetry.io/otel/sdk/metric v1.38.0
go.opentelemetry.io/otel/trace v1.38.0
Expand Down Expand Up @@ -234,7 +235,6 @@ require (
go.opentelemetry.io/contrib/detectors/gcp v1.38.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect
go.opentelemetry.io/otel/log v0.14.0 // indirect
go.opentelemetry.io/otel/sdk v1.38.0 // indirect
go.opentelemetry.io/otel/sdk/log v0.14.0 // indirect
Expand Down
Loading