Skip to content

Commit de065a8

Browse files
authored
oximeter integration (#178)
1 parent 1d8b818 commit de065a8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+3486
-732
lines changed

Cargo.lock

Lines changed: 968 additions & 572 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,14 @@ ciborium = "0.2"
7777
http = "0.2"
7878
humantime = "2.1"
7979
rand = "0.8"
80+
backoff = "0.4"
8081
mg-common = { path = "mg-common" }
8182
chrono = { version = "0.4", features = ["serde"] }
83+
oximeter = { git = "https://github.com/oxidecomputer/omicron", branch = "main"}
84+
oximeter-producer = { git = "https://github.com/oxidecomputer/omicron", branch= "main"}
85+
omicron-common = { git = "https://github.com/oxidecomputer/omicron", branch= "main"}
86+
internal-dns = { git = "https://github.com/oxidecomputer/omicron", branch = "main"}
87+
uuid = { version = "1.7", features = ["serde", "v4"] }
8288

8389
[workspace.dependencies.opte-ioctl]
8490
git = "https://github.com/oxidecomputer/opte"

bfd/src/lib.rs

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ use slog::{warn, Logger};
1010
use sm::StateMachine;
1111
use std::collections::HashMap;
1212
use std::net::IpAddr;
13+
use std::sync::atomic::AtomicU64;
14+
use std::sync::Arc;
1315
use std::time::Duration;
1416

1517
pub mod bidi;
@@ -76,10 +78,29 @@ impl Daemon {
7678
}
7779
}
7880

81+
/// Event counters for a BFD session.
///
/// `Session::new` creates one default-initialized (all zero) instance per
/// session, wraps it in an `Arc`, and hands a clone to the session's
/// `StateMachine`. Every visible update site bumps a counter by one using
/// `fetch_add(1, Ordering::Relaxed)`.
// NOTE(review): presumably these are exported as oximeter metrics; the
// producer side is not visible here -- confirm.
#[derive(Default)]
82+
pub struct SessionCounters {
83+
/// Incremented by the state machine's send loop each time
/// `sender.send((peer, pkt))` succeeds.
pub control_packets_sent: AtomicU64,
84+
/// Incremented by the state machine's send loop each time
/// `sender.send((peer, pkt))` returns an error.
pub control_packet_send_failures: AtomicU64,
85+
// NOTE(review): no increment site for this counter is visible in the
// reviewed code; confirm where it is updated.
pub control_packets_received: AtomicU64,
86+
/// Incremented in `State::recv` when a received message reports
/// `PacketState::Peer(BfdPeerState::AdminDown)`.
pub admin_down_status_received: AtomicU64,
87+
/// Incremented in `State::recv` when a received message reports
/// `PacketState::Peer(BfdPeerState::Down)`.
pub down_status_received: AtomicU64,
88+
/// Incremented in `State::recv` when a received message reports
/// `PacketState::Peer(BfdPeerState::Init)`.
pub init_status_received: AtomicU64,
89+
/// Incremented in `State::recv` when a received message reports
/// `PacketState::Peer(BfdPeerState::Up)`.
pub up_status_received: AtomicU64,
90+
/// Incremented in `State::recv` when a received message reports
/// `PacketState::Unknown(_)`.
pub unknown_status_received: AtomicU64,
91+
/// Incremented by the state machine run loop when the state changes to
/// `BfdPeerState::Init`.
pub transition_to_init: AtomicU64,
92+
/// Incremented by the state machine run loop when the state changes to
/// `BfdPeerState::Down` or `BfdPeerState::AdminDown` (both share this
/// counter).
pub transition_to_down: AtomicU64,
93+
/// Incremented by the state machine run loop when the state changes to
/// `BfdPeerState::Up`.
pub transition_to_up: AtomicU64,
94+
/// Incremented in `State::recv` when `recv_timeout` on the endpoint's
/// receiver returns `RecvTimeoutError::Timeout`; the wait is
/// `required_min_rx * detection_multiplier`.
pub timeout_expired: AtomicU64,
95+
/// Incremented in `State::recv` on the error path that exits the receive
/// loop and returns the "recv channel closed" error.
pub message_receive_error: AtomicU64,
96+
// NOTE(review): no increment site for this counter is visible in the
// reviewed code; confirm where it is updated.
pub unexpected_message: AtomicU64,
97+
}
98+
7999
/// A session holds a BFD state machine for a particular peer.
///
/// `Session::new` builds the state machine, starts it with `sm.run(ep, db)`,
/// and keeps a handle to the counters it shares with that state machine.
80100
pub struct Session {
81101
/// State machine driving this session; already running once `Session::new`
/// returns.
pub sm: StateMachine,
82102
/// Mode passed to `Session::new` and stored unchanged.
pub mode: SessionMode,
103+
/// Counters shared with `sm`: `Session::new` passes a clone of this `Arc`
/// to `StateMachine::new`, so both refer to the same `SessionCounters`.
pub counters: Arc<SessionCounters>,
83104
}
84105

85106
impl Session {
@@ -94,10 +115,16 @@ impl Session {
94115
db: rdb::Db,
95116
log: Logger,
96117
) -> Self {
97-
let mut sm =
98-
StateMachine::new(addr, required_rx, detection_multiplier, log);
118+
let counters = Arc::new(SessionCounters::default());
119+
let mut sm = StateMachine::new(
120+
addr,
121+
required_rx,
122+
detection_multiplier,
123+
counters.clone(),
124+
log,
125+
);
99126
sm.run(ep, db);
100-
Session { sm, mode }
127+
Session { sm, mode, counters }
101128
}
102129
}
103130

bfd/src/sm.rs

Lines changed: 105 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
// License, v. 2.0. If a copy of the MPL was not distributed with this
33
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
44

5-
use crate::err;
6-
use crate::packet::Control;
5+
use crate::packet::{Control, State as PacketState};
76
use crate::{
87
bidi, inf, packet, trc, util::update_peer_info, wrn, BfdPeerState, PeerInfo,
98
};
9+
use crate::{err, SessionCounters};
1010
use anyhow::{anyhow, Result};
1111
use slog::{warn, Logger};
1212
use std::net::IpAddr;
@@ -48,6 +48,7 @@ pub struct StateMachine {
4848
required_rx: Duration,
4949
detection_multiplier: u8,
5050
kill_switch: Arc<AtomicBool>,
51+
counters: Arc<SessionCounters>,
5152
log: Logger,
5253
}
5354

@@ -64,6 +65,7 @@ impl StateMachine {
6465
peer: IpAddr,
6566
required_rx: Duration,
6667
detection_multiplier: u8,
68+
counters: Arc<SessionCounters>,
6769
log: Logger,
6870
) -> Self {
6971
let state = Down::new(peer, log.clone());
@@ -73,6 +75,7 @@ impl StateMachine {
7375
required_rx,
7476
detection_multiplier,
7577
kill_switch: Arc::new(AtomicBool::new(false)),
78+
counters,
7679
log,
7780
}
7881
}
@@ -122,6 +125,7 @@ impl StateMachine {
122125
let peer = self.peer;
123126
let kill_switch = self.kill_switch.clone();
124127
let log = self.log.clone();
128+
let counters = self.counters.clone();
125129
spawn(move || loop {
126130
let prev = state.read().unwrap().state();
127131
let (st, ep) = match state.read().unwrap().run(
@@ -130,6 +134,7 @@ impl StateMachine {
130134
remote.clone(),
131135
kill_switch.clone(),
132136
db.clone(),
137+
counters.clone(),
133138
) {
134139
Ok(result) => result,
135140
Err(_) => break,
@@ -143,6 +148,23 @@ impl StateMachine {
143148
}
144149

145150
if prev != new {
151+
match new {
152+
BfdPeerState::AdminDown | BfdPeerState::Down => {
153+
counters
154+
.transition_to_down
155+
.fetch_add(1, Ordering::Relaxed);
156+
}
157+
BfdPeerState::Init => {
158+
counters
159+
.transition_to_init
160+
.fetch_add(1, Ordering::Relaxed);
161+
}
162+
BfdPeerState::Up => {
163+
counters
164+
.transition_to_up
165+
.fetch_add(1, Ordering::Relaxed);
166+
}
167+
}
146168
inf!(log, prev, peer; "transition -> {:?}", new);
147169
}
148170
});
@@ -162,6 +184,7 @@ impl StateMachine {
162184
let peer = self.peer;
163185
let stop = self.kill_switch.clone();
164186
let log = self.log.clone();
187+
let counters = self.counters.clone();
165188
// State does not change for the lifetime of the trait so it's safe to
166189
// just copy it out of self for sending into the spawned thread. The
167190
// reason this is a dynamic method at all is to get runtime polymorphic
@@ -203,6 +226,13 @@ impl StateMachine {
203226

204227
if let Err(e) = sender.send((peer, pkt)) {
205228
wrn!(log, st, peer; "send: {}", e);
229+
counters
230+
.control_packet_send_failures
231+
.fetch_add(1, Ordering::Relaxed);
232+
} else {
233+
counters
234+
.control_packets_sent
235+
.fetch_add(1, Ordering::Relaxed);
206236
}
207237
});
208238
}
@@ -255,6 +285,7 @@ pub(crate) trait State: Sync + Send {
255285
remote: Arc<Mutex<PeerInfo>>,
256286
kill_switch: Arc<AtomicBool>,
257287
db: rdb::Db,
288+
counters: Arc<SessionCounters>,
258289
) -> Result<(Box<dyn State>, BfdEndpoint)>;
259290

260291
/// Return the `BfdPeerState` associated with the implementor of this trait.
@@ -297,13 +328,42 @@ pub(crate) trait State: Sync + Send {
297328
local: PeerInfo,
298329
remote: &Arc<Mutex<PeerInfo>>,
299330
log: Logger,
331+
counters: Arc<SessionCounters>,
300332
) -> Result<RecvResult> {
301333
match endpoint.rx.recv_timeout(
302334
local.required_min_rx * local.detection_multiplier.into(),
303335
) {
304336
Ok((addr, msg)) => {
305337
trc!(log, self.state(), self.peer(); "recv: {:?}", msg);
306338

339+
match msg.state() {
340+
PacketState::Peer(BfdPeerState::AdminDown) => {
341+
counters
342+
.admin_down_status_received
343+
.fetch_add(1, Ordering::Relaxed);
344+
}
345+
PacketState::Peer(BfdPeerState::Down) => {
346+
counters
347+
.down_status_received
348+
.fetch_add(1, Ordering::Relaxed);
349+
}
350+
PacketState::Peer(BfdPeerState::Init) => {
351+
counters
352+
.init_status_received
353+
.fetch_add(1, Ordering::Relaxed);
354+
}
355+
PacketState::Peer(BfdPeerState::Up) => {
356+
counters
357+
.up_status_received
358+
.fetch_add(1, Ordering::Relaxed);
359+
}
360+
PacketState::Unknown(_) => {
361+
counters
362+
.unknown_status_received
363+
.fetch_add(1, Ordering::Relaxed);
364+
}
365+
}
366+
307367
update_peer_info(remote, &msg);
308368

309369
if msg.poll() {
@@ -320,6 +380,7 @@ pub(crate) trait State: Sync + Send {
320380
}
321381
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
322382
wrn!(log, self.state(), self.peer(); "timeout expired");
383+
counters.timeout_expired.fetch_add(1, Ordering::Relaxed);
323384
let next = Down::new(self.peer(), log.clone());
324385
Ok(RecvResult::TransitionTo(Box::new(next)))
325386
}
@@ -331,6 +392,9 @@ pub(crate) trait State: Sync + Send {
331392
"recv: {}, exiting recieve loop",
332393
e
333394
);
395+
counters
396+
.message_receive_error
397+
.fetch_add(1, Ordering::Relaxed);
334398
Err(anyhow::anyhow!("recv channel closed"))
335399
}
336400
}
@@ -372,6 +436,7 @@ impl State for Down {
372436
remote: Arc<Mutex<PeerInfo>>,
373437
kill_switch: Arc<AtomicBool>,
374438
db: rdb::Db,
439+
counters: Arc<SessionCounters>,
375440
) -> Result<(Box<dyn State>, BfdEndpoint)> {
376441
match self.peer {
377442
IpAddr::V4(addr) => db.disable_nexthop4(addr),
@@ -384,13 +449,18 @@ impl State for Down {
384449
}
385450
loop {
386451
// Get an incoming message
387-
let (_addr, msg) =
388-
match self.recv(&endpoint, local, &remote, self.log.clone())? {
389-
RecvResult::MessageFrom((addr, control)) => (addr, control),
390-
RecvResult::TransitionTo(state) => {
391-
return Ok((state, endpoint))
392-
}
393-
};
452+
let (_addr, msg) = match self.recv(
453+
&endpoint,
454+
local,
455+
&remote,
456+
self.log.clone(),
457+
counters.clone(),
458+
)? {
459+
RecvResult::MessageFrom((addr, control)) => (addr, control),
460+
RecvResult::TransitionTo(state) => {
461+
return Ok((state, endpoint))
462+
}
463+
};
394464

395465
if kill_switch.load(Ordering::Relaxed) {
396466
return Err(anyhow!("killed"));
@@ -455,16 +525,22 @@ impl State for Init {
455525
remote: Arc<Mutex<PeerInfo>>,
456526
kill_switch: Arc<AtomicBool>,
457527
_db: rdb::Db,
528+
counters: Arc<SessionCounters>,
458529
) -> Result<(Box<dyn State>, BfdEndpoint)> {
459530
loop {
460531
// Get an incoming message
461-
let (_addr, msg) =
462-
match self.recv(&endpoint, local, &remote, self.log.clone())? {
463-
RecvResult::MessageFrom((addr, control)) => (addr, control),
464-
RecvResult::TransitionTo(state) => {
465-
return Ok((state, endpoint))
466-
}
467-
};
532+
let (_addr, msg) = match self.recv(
533+
&endpoint,
534+
local,
535+
&remote,
536+
self.log.clone(),
537+
counters.clone(),
538+
)? {
539+
RecvResult::MessageFrom((addr, control)) => (addr, control),
540+
RecvResult::TransitionTo(state) => {
541+
return Ok((state, endpoint))
542+
}
543+
};
468544

469545
if kill_switch.load(Ordering::Relaxed) {
470546
return Err(anyhow!("killed"));
@@ -527,6 +603,7 @@ impl State for Up {
527603
remote: Arc<Mutex<PeerInfo>>,
528604
kill_switch: Arc<AtomicBool>,
529605
db: rdb::Db,
606+
counters: Arc<SessionCounters>,
530607
) -> Result<(Box<dyn State>, BfdEndpoint)> {
531608
match self.peer {
532609
IpAddr::V4(addr) => db.enable_nexthop4(addr),
@@ -539,13 +616,18 @@ impl State for Up {
539616
}
540617
loop {
541618
// Get an incoming message
542-
let (_addr, msg) =
543-
match self.recv(&endpoint, local, &remote, self.log.clone())? {
544-
RecvResult::MessageFrom((addr, control)) => (addr, control),
545-
RecvResult::TransitionTo(state) => {
546-
return Ok((state, endpoint))
547-
}
548-
};
619+
let (_addr, msg) = match self.recv(
620+
&endpoint,
621+
local,
622+
&remote,
623+
self.log.clone(),
624+
counters.clone(),
625+
)? {
626+
RecvResult::MessageFrom((addr, control)) => (addr, control),
627+
RecvResult::TransitionTo(state) => {
628+
return Ok((state, endpoint))
629+
}
630+
};
549631

550632
if kill_switch.load(Ordering::Relaxed) {
551633
return Err(anyhow!("killed"));

0 commit comments

Comments
 (0)