Skip to content

Commit befcf6d

Browse files
committed
feat(pegboard): add draining state to alloc metrics (#2565)
<!-- Please make sure there is an issue that this PR is correlated to. -->

## Changes

<!-- If there are frontend changes, please include screenshots. -->
1 parent 86c6e41 commit befcf6d

File tree

3 files changed

+125
-24
lines changed

3 files changed

+125
-24
lines changed

packages/edge/services/pegboard/src/metrics.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,28 @@ lazy_static::lazy_static! {
1111
pub static ref CLIENT_MEMORY_TOTAL: IntGaugeVec = register_int_gauge_vec_with_registry!(
1212
"pegboard_client_memory_total",
1313
"Total MiB of memory available on a client.",
14-
&["client_id", "flavor"],
14+
&["client_id", "flavor", "state"],
1515
*REGISTRY
1616
).unwrap();
1717

1818
pub static ref CLIENT_CPU_TOTAL: IntGaugeVec = register_int_gauge_vec_with_registry!(
1919
"pegboard_client_cpu_total",
2020
"Total millicores of cpu available on a client.",
21-
&["client_id", "flavor"],
21+
&["client_id", "flavor", "state"],
2222
*REGISTRY
2323
).unwrap();
2424

2525
pub static ref CLIENT_MEMORY_ALLOCATED: IntGaugeVec = register_int_gauge_vec_with_registry!(
2626
"pegboard_client_memory_allocated",
2727
"Total MiB of memory allocated on a client.",
28-
&["client_id", "flavor"],
28+
&["client_id", "flavor", "state"],
2929
*REGISTRY
3030
).unwrap();
3131

3232
pub static ref CLIENT_CPU_ALLOCATED: IntGaugeVec = register_int_gauge_vec_with_registry!(
3333
"pegboard_client_cpu_allocated",
3434
"Total millicores of cpu allocated on a client.",
35-
&["client_id", "flavor"],
35+
&["client_id", "flavor", "state"],
3636
*REGISTRY
3737
).unwrap();
3838

packages/edge/services/pegboard/src/workflows/actor/analytics.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,8 @@ pub async fn insert_clickhouse(
277277
selected_memory_mib: state_row.selected_resources_memory_mib.unwrap_or_default() as u32,
278278
root_user_enabled: state_row.root_user_enabled,
279279
env_vars: state_row.environment.len() as i64,
280-
env_var_bytes: state_row.environment
280+
env_var_bytes: state_row
281+
.environment
281282
.iter()
282283
.map(|(k, v)| k.len() + v.len())
283284
.sum::<usize>() as i64,

packages/edge/services/pegboard/src/workflows/client/mod.rs

Lines changed: 119 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ pub async fn pegboard_client(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu
109109
activity(UpdateMetricsInput {
110110
client_id,
111111
flavor,
112+
draining: state.drain_timeout_ts.is_some(),
112113
clear: false,
113114
}),
114115
))
@@ -125,6 +126,7 @@ pub async fn pegboard_client(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu
125126
activity(UpdateMetricsInput {
126127
client_id,
127128
flavor,
129+
draining: state.drain_timeout_ts.is_some(),
128130
clear: false,
129131
}),
130132
))
@@ -254,6 +256,7 @@ pub async fn pegboard_client(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu
254256
ctx.activity(UpdateMetricsInput {
255257
client_id: input.client_id,
256258
flavor: input.flavor,
259+
draining: false,
257260
clear: true,
258261
})
259262
.await?;
@@ -691,6 +694,7 @@ pub async fn handle_commands(
691694
activity(UpdateMetricsInput {
692695
client_id,
693696
flavor,
697+
draining: drain_timeout_ts.is_some(),
694698
clear: false,
695699
}),
696700
))
@@ -933,24 +937,75 @@ async fn check_expired(ctx: &ActivityCtx, input: &CheckExpiredInput) -> GlobalRe
933937
struct UpdateMetricsInput {
934938
client_id: Uuid,
935939
flavor: ClientFlavor,
940+
#[serde(default)]
941+
draining: bool,
936942
clear: bool,
937943
}
938944

939945
#[activity(UpdateMetrics)]
940946
async fn update_metrics(ctx: &ActivityCtx, input: &UpdateMetricsInput) -> GlobalResult<()> {
941947
if input.clear {
948+
metrics::CLIENT_MEMORY_TOTAL
949+
.with_label_values(&[
950+
&input.client_id.to_string(),
951+
&input.flavor.to_string(),
952+
"active",
953+
])
954+
.set(0);
955+
metrics::CLIENT_CPU_TOTAL
956+
.with_label_values(&[
957+
&input.client_id.to_string(),
958+
&input.flavor.to_string(),
959+
"active",
960+
])
961+
.set(0);
962+
metrics::CLIENT_MEMORY_TOTAL
963+
.with_label_values(&[
964+
&input.client_id.to_string(),
965+
&input.flavor.to_string(),
966+
"draining",
967+
])
968+
.set(0);
969+
metrics::CLIENT_CPU_TOTAL
970+
.with_label_values(&[
971+
&input.client_id.to_string(),
972+
&input.flavor.to_string(),
973+
"draining",
974+
])
975+
.set(0);
942976
metrics::CLIENT_MEMORY_ALLOCATED
943-
.with_label_values(&[&input.client_id.to_string(), &input.flavor.to_string()])
977+
.with_label_values(&[
978+
&input.client_id.to_string(),
979+
&input.flavor.to_string(),
980+
"active",
981+
])
944982
.set(0);
945-
946983
metrics::CLIENT_CPU_ALLOCATED
947-
.with_label_values(&[&input.client_id.to_string(), &input.flavor.to_string()])
984+
.with_label_values(&[
985+
&input.client_id.to_string(),
986+
&input.flavor.to_string(),
987+
"active",
988+
])
989+
.set(0);
990+
metrics::CLIENT_MEMORY_ALLOCATED
991+
.with_label_values(&[
992+
&input.client_id.to_string(),
993+
&input.flavor.to_string(),
994+
"draining",
995+
])
996+
.set(0);
997+
metrics::CLIENT_CPU_ALLOCATED
998+
.with_label_values(&[
999+
&input.client_id.to_string(),
1000+
&input.flavor.to_string(),
1001+
"draining",
1002+
])
9481003
.set(0);
9491004

9501005
return Ok(());
9511006
}
9521007

953-
let (total_mem, total_cpu, remaining_mem, remaining_cpu) =
1008+
let (total_mem, remaining_mem, total_cpu, remaining_cpu) =
9541009
ctx.fdb()
9551010
.await?
9561011
.run(|tx, _mc| async move {
@@ -992,35 +1047,80 @@ async fn update_metrics(ctx: &ActivityCtx, input: &UpdateMetricsInput) -> Global
9921047
)
9931048
.map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?;
9941049

995-
Ok((
996-
total_mem,
997-
remaining_mem,
998-
total_cpu,
999-
remaining_cpu,
1000-
))
1050+
Ok((total_mem, remaining_mem, total_cpu, remaining_cpu))
10011051
})
10021052
.custom_instrument(tracing::info_span!("client_update_metrics_tx"))
10031053
.await?;
10041054

1055+
let (state, other_state) = if input.draining {
1056+
("draining", "active")
1057+
} else {
1058+
("active", "draining")
1059+
};
1060+
let allocated_mem = total_mem.saturating_sub(remaining_mem);
1061+
let allocated_cpu = total_cpu.saturating_sub(remaining_cpu);
1062+
10051063
metrics::CLIENT_MEMORY_TOTAL
1006-
.with_label_values(&[&input.client_id.to_string(), &input.flavor.to_string()])
1064+
.with_label_values(&[
1065+
&input.client_id.to_string(),
1066+
&input.flavor.to_string(),
1067+
state,
1068+
])
10071069
.set(total_mem.try_into()?);
1008-
10091070
metrics::CLIENT_CPU_TOTAL
1010-
.with_label_values(&[&input.client_id.to_string(), &input.flavor.to_string()])
1071+
.with_label_values(&[
1072+
&input.client_id.to_string(),
1073+
&input.flavor.to_string(),
1074+
state,
1075+
])
10111076
.set(total_cpu.try_into()?);
10121077

1013-
let allocated_mem = total_mem.saturating_sub(remaining_mem);
1014-
let allocated_cpu = total_cpu.saturating_sub(remaining_cpu);
1015-
10161078
metrics::CLIENT_MEMORY_ALLOCATED
1017-
.with_label_values(&[&input.client_id.to_string(), &input.flavor.to_string()])
1079+
.with_label_values(&[
1080+
&input.client_id.to_string(),
1081+
&input.flavor.to_string(),
1082+
state,
1083+
])
10181084
.set(allocated_mem.try_into()?);
1019-
10201085
metrics::CLIENT_CPU_ALLOCATED
1021-
.with_label_values(&[&input.client_id.to_string(), &input.flavor.to_string()])
1086+
.with_label_values(&[
1087+
&input.client_id.to_string(),
1088+
&input.flavor.to_string(),
1089+
state,
1090+
])
10221091
.set(allocated_cpu.try_into()?);
10231092

1093+
// Clear other state
1094+
metrics::CLIENT_MEMORY_TOTAL
1095+
.with_label_values(&[
1096+
&input.client_id.to_string(),
1097+
&input.flavor.to_string(),
1098+
other_state,
1099+
])
1100+
.set(0);
1101+
metrics::CLIENT_CPU_TOTAL
1102+
.with_label_values(&[
1103+
&input.client_id.to_string(),
1104+
&input.flavor.to_string(),
1105+
other_state,
1106+
])
1107+
.set(0);
1108+
1109+
metrics::CLIENT_MEMORY_ALLOCATED
1110+
.with_label_values(&[
1111+
&input.client_id.to_string(),
1112+
&input.flavor.to_string(),
1113+
other_state,
1114+
])
1115+
.set(0);
1116+
metrics::CLIENT_CPU_ALLOCATED
1117+
.with_label_values(&[
1118+
&input.client_id.to_string(),
1119+
&input.flavor.to_string(),
1120+
other_state,
1121+
])
1122+
.set(0);
1123+
10241124
Ok(())
10251125
}
10261126

0 commit comments

Comments
 (0)