Skip to content

Commit 65aa627

Browse files
committed
fix(guard): add cache clearing to ws retry, increase ws retries (#2448)
<!-- Please make sure there is an issue that this PR is correlated to. --> ## Changes <!-- If there are frontend changes, please include screenshots. -->
1 parent 7855096 commit 65aa627

File tree

2 files changed

+37
-12
lines changed

2 files changed

+37
-12
lines changed

packages/edge/infra/guard/core/src/proxy_service.rs

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -967,7 +967,7 @@ impl ProxyService {
967967
async fn handle_websocket_upgrade(
968968
&self,
969969
req: Request<BodyIncoming>,
970-
target: RouteTarget,
970+
mut target: RouteTarget,
971971
) -> GlobalResult<Response<Full<Bytes>>> {
972972
// Get actor and server IDs for metrics and middleware
973973
let actor_id = target.actor_id;
@@ -978,6 +978,19 @@ impl ProxyService {
978978
// Start timing the request (metrics already incremented in handle_request)
979979
let start_time = Instant::now();
980980

981+
// Parsed for retries later
982+
let req_host = req
983+
.headers()
984+
.get(hyper::header::HOST)
985+
.and_then(|h| h.to_str().ok())
986+
.unwrap_or("unknown")
987+
.to_string();
988+
let req_path = req
989+
.uri()
990+
.path_and_query()
991+
.map(|x| x.to_string())
992+
.unwrap_or_else(|| req.uri().path().to_string());
993+
981994
// Log request details
982995
info!("WebSocket upgrade request for path: {}, target host: {}:{}, actor_id: {}, server_id: {}",
983996
target.path, target.host, target.port, actor_id_str, server_id_str);
@@ -1044,11 +1057,8 @@ impl ProxyService {
10441057
}
10451058
}
10461059

1047-
// Now we need to connect to the upstream WebSocket server
1048-
let target_url = format!("ws://{}:{}{}", target.host, target.port, target.path);
1049-
info!("Target upstream WebSocket URL: {}", target_url);
1050-
10511060
// Clone needed values for the spawned task
1061+
let state = self.state.clone();
10521062
let actor_id_str_clone = actor_id_str.clone();
10531063
let server_id_str_clone = server_id_str.clone();
10541064
let path = target.path.clone();
@@ -1110,12 +1120,11 @@ impl ProxyService {
11101120
};
11111121

11121122
// Now attempt to connect to the upstream server
1113-
info!(
1114-
"Attempting to connect to upstream WebSocket at {}",
1115-
target_url
1116-
);
1123+
info!("Attempting connect to upstream WebSocket");
11171124
while attempts < max_attempts {
11181125
attempts += 1;
1126+
1127+
let target_url = format!("ws://{}:{}{}", target.host, target.port, target.path);
11191128
info!(
11201129
"WebSocket request attempt {}/{} to {}",
11211130
attempts, max_attempts, target_url
@@ -1196,7 +1205,23 @@ impl ProxyService {
11961205
// Use backoff for the next attempt
11971206
let backoff = Self::calculate_backoff(attempts, initial_interval);
11981207
info!("Waiting for {:?} before next connection attempt", backoff);
1199-
tokio::time::sleep(backoff).await;
1208+
1209+
let (_, new_target) = tokio::join!(
1210+
tokio::time::sleep(backoff),
1211+
// Resolve target again, this time ignoring cache. This makes sure
1212+
// we always re-fetch the route on error
1213+
state.resolve_route(&req_host, &req_path, state.port_type.clone(), true,),
1214+
);
1215+
1216+
match new_target {
1217+
Ok(ResolveRouteOutput::Target(new_target)) => {
1218+
target = new_target;
1219+
}
1220+
Ok(ResolveRouteOutput::Response(_response)) => {
1221+
error!("Expected target, got response")
1222+
}
1223+
Err(e) => error!("Routing error: {}", e),
1224+
}
12001225
}
12011226

12021227
// If we couldn't connect to the upstream server, exit the task

packages/edge/infra/guard/server/src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ fn create_middleware_function(
127127
amount: 20, // 20 concurrent requests
128128
},
129129
retry: RetryConfig {
130-
max_attempts: 4, // 3 retry attempts
131-
initial_interval: 250, // 100ms initial interval
130+
max_attempts: 7,
131+
initial_interval: 150,
132132
},
133133
timeout: TimeoutConfig {
134134
request_timeout: 30, // 30 seconds for requests

0 commit comments

Comments
 (0)