diff --git a/lib/opte/src/d_error.rs b/lib/opte/src/d_error.rs index a558fe97..edbe61f4 100644 --- a/lib/opte/src/d_error.rs +++ b/lib/opte/src/d_error.rs @@ -145,7 +145,7 @@ impl LabelBlock { } /// Provides access to all stored [`CStr`]s. - pub fn entries<'a>(&'a self) -> LabelBlockIter<'a, L> { + pub fn entries(&self) -> LabelBlockIter<'_, L> { LabelBlockIter { pos: 0, inner: self } } diff --git a/lib/opte/src/engine/packet.rs b/lib/opte/src/engine/packet.rs index f0c2d89c..d9ac2107 100644 --- a/lib/opte/src/engine/packet.rs +++ b/lib/opte/src/engine/packet.rs @@ -469,7 +469,7 @@ impl PacketChain { /// Removes the next packet from the top of the chain and returns /// it, taking ownership. - pub fn next(&mut self) -> Option> { + pub fn pop_front(&mut self) -> Option> { if let Some(ref mut list) = &mut self.inner { unsafe { let curr = list.head.as_ptr(); @@ -554,7 +554,7 @@ impl Drop for PacketChain { unsafe { ddi::freemsgchain(list.head.as_ptr()) }; } } else { - while let Some(pkt) = self.next() { + while let Some(pkt) = self.pop_front() { drop(pkt); } } @@ -3945,7 +3945,7 @@ mod test { fn create_linked_mblks(n: usize) -> Vec<*mut mblk_t> { let mut els = vec![]; - for i in 0..n { + for _ in 0..n { els.push(allocb(8)); } @@ -3962,7 +3962,7 @@ mod test { #[test] fn chain_has_correct_ends() { - let mut els = create_linked_mblks(3); + let els = create_linked_mblks(3); let chain = unsafe { PacketChain::new(els[0]) }.unwrap(); let chain_inner = chain.inner.as_ref().unwrap(); @@ -3976,7 +3976,7 @@ mod test { let mut chain = unsafe { PacketChain::new(els[0]) }.unwrap(); - let p0 = chain.next().unwrap(); + let p0 = chain.pop_front().unwrap(); assert_eq!(p0.mblk_addr(), els[0] as uintptr_t); unsafe { assert!((*els[0]).b_prev.is_null()); @@ -4024,10 +4024,10 @@ mod test { let mut chain = unsafe { PacketChain::new(els[0]) }.unwrap(); for i in 0..els.len() { - let pkt = chain.next().unwrap(); + let pkt = chain.pop_front().unwrap(); assert_eq!(pkt.mblk_addr(), els[i] as 
uintptr_t); } - assert!(chain.next().is_none()); + assert!(chain.pop_front().is_none()); } } diff --git a/lib/opte/src/engine/rule.rs b/lib/opte/src/engine/rule.rs index 4c9fee9a..881d7fde 100644 --- a/lib/opte/src/engine/rule.rs +++ b/lib/opte/src/engine/rule.rs @@ -834,7 +834,7 @@ impl<'a> Rule { #[cfg(debug_assertions)] { if let Some(preds) = &self.state.preds { - if preds.hdr_preds.len() == 0 && preds.data_preds.len() == 0 { + if preds.hdr_preds.is_empty() && preds.data_preds.is_empty() { panic!( "bug: RulePredicates must have at least one \ predicate" diff --git a/xde/src/lib.rs b/xde/src/lib.rs index 252d16b8..726f1ef4 100644 --- a/xde/src/lib.rs +++ b/xde/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2022 Oxide Computer Company +// Copyright 2024 Oxide Computer Company // xde - A mac provider for OPTE-based network implementations. #![feature(extern_types)] @@ -44,6 +44,7 @@ pub mod dls; pub mod ip; pub mod mac; mod mac_sys; +pub mod route; pub mod secpolicy; pub mod sys; pub mod xde; diff --git a/xde/src/route.rs b/xde/src/route.rs new file mode 100644 index 00000000..089bc36c --- /dev/null +++ b/xde/src/route.rs @@ -0,0 +1,688 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +// Copyright 2024 Oxide Computer Company + +use crate::ip; +use crate::sys; +use crate::xde::xde_underlay_port; +use crate::xde::DropRef; +use crate::xde::XdeDev; +use alloc::collections::btree_map::Entry; +use alloc::collections::BTreeMap; +use alloc::sync::Arc; +use core::ffi::CStr; +use core::ptr; +use core::time::Duration; +use illumos_sys_hdrs::*; +use opte::ddi::sync::KRwLock; +use opte::ddi::sync::KRwLockType; +use opte::ddi::time::Moment; +use opte::engine::ether::EtherAddr; +use opte::engine::ip6::Ipv6Addr; + +// XXX: completely arbitrary timeouts. +/// The duration a cached route remains valid for before it must be +/// refreshed. +/// +/// Expired routes will not be removed from the cache, and will leave +/// an entry to enable a quick in-place refresh in the `BTreeMap`. +const EXPIRE_ROUTE_LIFETIME: Duration = Duration::from_millis(100); + +/// The time after which a route should be completely removed from +/// the cache. +const REMOVE_ROUTE_LIFETIME: Duration = Duration::from_millis(1000); + +/// Maximum cache size, set to prevent excessive map modification latency. 
+const MAX_CACHE_ENTRIES: usize = 512; + +extern "C" { + pub fn __dtrace_probe_next__hop( + dst: uintptr_t, + gw: uintptr_t, + gw_ether_src: uintptr_t, + gw_ether_dst: uintptr_t, + msg: *const c_char, + ); + + pub fn __dtrace_probe_routecache__full( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); + + pub fn __dtrace_probe_routecache__hit( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); + + pub fn __dtrace_probe_routecache__insert( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); + + pub fn __dtrace_probe_routecache__refresh( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); + + pub fn __dtrace_probe_routecache__delete( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); +} + +fn next_hop_probe( + dst: &Ipv6Addr, + gw: Option<&Ipv6Addr>, + gw_eth_src: EtherAddr, + gw_eth_dst: EtherAddr, + msg: &CStr, +) { + let gw_bytes = gw.unwrap_or(&Ipv6Addr::from([0u8; 16])).bytes(); + + unsafe { + __dtrace_probe_next__hop( + dst.bytes().as_ptr() as uintptr_t, + gw_bytes.as_ptr() as uintptr_t, + gw_eth_src.to_bytes().as_ptr() as uintptr_t, + gw_eth_dst.to_bytes().as_ptr() as uintptr_t, + msg.as_ptr(), + ); + } +} + +fn route_full_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__full( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + +fn route_hit_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__hit( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + +fn route_insert_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__insert( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + +fn route_refresh_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__refresh( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as 
uintptr_t, + ); + } +} + +fn route_delete_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__delete( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + +// The following are wrappers for reference drop functions used in XDE. + +fn ire_refrele(ire: *mut ip::ire_t) { + unsafe { ip::ire_refrele(ire) } +} + +fn nce_refrele(ire: *mut ip::nce_t) { + unsafe { ip::nce_refrele(ire) } +} + +fn netstack_rele(ns: *mut ip::netstack_t) { + unsafe { ip::netstack_rele(ns) } +} + +// At this point the core engine of OPTE has delivered a Geneve +// encapsulated guest Ethernet Frame (also simply referred to as "the +// packet") to xde to be sent to the specific outer IPv6 destination +// address. This packet includes the outer Ethernet Frame as well; +// however, the outer frame's destination and source addresses are set +// to zero. It is the job of this function to determine what those +// values should be. +// +// Adjacent to xde is the native IPv6 stack along with its routing +// table. This table is routinely updated to indicate the best path to +// any given IPv6 destination that may be specified in the outer IP +// header. As xde is not utilizing the native IPv6 stack to send out +// the packet, but rather is handing it directly to the mac module, it +// must somehow query the native routing table to determine which port +// this packet should egress and fill in the outer frame accordingly. +// This query is done via a private interface which allows a kernel +// module outside of IP to query the routing table. +// +// This process happens in a sequence of steps described below. +// +// 1. With an IPv6 destination in hand we need to determine the next +// hop, also known as the gateway, for this address. That is, of +// our neighbors (in this case one of the two switches, which are +// also acting as routers), who should we forward this packet to in +// order for it to arrive at its destination? 
We get this +// information from the routing table, which contains Internet +// Routing Entries, or IREs. Specifically, we query the native IPv6 +// routing table using the kernel function +// `ire_ftable_lookup_v6()`. This function returns an +// `ire_t`, which includes the member `ire_u`, which contains the +// address of the gateway as `ire6_gateway_addr`. +// +// 2. We have the gateway IPv6 address; but in the world of the Oxide +// Network that is not enough to deliver the packet. In the Oxide +// Network the router (switch) is not a member of the host's +// network. Instead, we rely on link-local addresses to reach the +// switches. The lookup in step (1) gave us that link-local address +// of the gateway; now we need to figure out how to reach it. That +// requires consulting the routing table a second time: this time +// to find the IRE for the gateway's link-local address. +// +// 3. The IRE of the link-local address from step (2) allows us to +// determine which interface this traffic should traverse. +// Specifically it gives us access to the `ill_t` of the gateway's +// link-local address. This structure contains the IP Lower Level +// information. In particular it contains the `ill_phys_addr` +// which gives us the source MAC address for our outer frame. +// +// 4. The final piece of information to obtain is the destination MAC +// address. We have the link-local address of the switch port we +// want to send to. To get the MAC address of this port it must +// first be assumed that the host and its connected switches have +// performed NDP in order to learn each other's IPv6 addresses and +// corresponding MAC addresses. With that information in hand it is +// a matter of querying the kernel's Neighbor Cache Entry Table +// (NCE) for the mapping that belongs to our gateway's link-local +// address. This is done via the `nce_lookup_v6()` kernel function. 
+// +// With those four steps we have obtained the source and destination +// MAC addresses and the packet can be sent to mac to be delivered to +// the underlying NIC. However, the careful reader may find themselves +// confused about how step (1) actually works. +// +// If step (1) always returns a single gateway, then how do we +// actually utilize both NICs/switches? +// +// This is where a bit of knowledge about routing tables comes into +// play along with our very own Delay Driven Multipath in-rack routing +// protocol. You might imagine the IPv6 routing table on an Oxide Sled +// looking something like this. +// +// Destination/Mask Gateway Flags If +// ---------------- ------------------------- ----- --------- +// default fe80::<sc1_p5> UG cxgbe0 +// default fe80::<sc1_p6> UG cxgbe1 +// fe80::/10 fe80::<sled1_p5> U cxgbe0 +// fe80::/10 fe80::<sled1_p6> U cxgbe1 +// fd00:<rack1_sled1>::/64 fe80::<sled1_p5> U cxgbe0 +// fd00:<rack1_sled1>::/64 fe80::<sled1_p6> U cxgbe1 +// +// Let's say this host (sled1) wants to send a packet to sled2. Our +// sled1 host lives on network `fd00:<rack1_sled1>::/64` while our +// sled2 host lives on `fd00:<rack1_sled2>::/64` -- the key point +// being they are two different networks and thus must be routed to +// talk to each other. For sled1 to send this packet it will attempt +// to look up destination `fd00:<rack1_sled2>::7777` (in this case +// `7777` is the IP of sled2) in the routing table above. The routing +// table will then perform a longest prefix match against the +// `Destination` field for all entries: the longest prefix that +// matches wins and that entry is returned. However, in this case, no +// destinations match except for the `default` ones. When more than +// one entry matches it is left to the system to decide which one to +// return; typically this just means the first one that matches. But +// not for us! This is where DDM comes into play. +// +// Let's reimagine the routing table again, this time with a +// probability added to each gateway entry. 
+// +// Destination/Mask Gateway Flags If P +// ---------------- ------------------------- ----- ------- ---- +// default fe80::<sc1_p5> UG cxgbe0 0.70 +// default fe80::<sc1_p6> UG cxgbe1 0.30 +// fe80::/10 fe80::<sled1_p5> U cxgbe0 +// fe80::/10 fe80::<sled1_p6> U cxgbe1 +// fd00:<rack1_sled1>::/64 fe80::<sled1_p5> U cxgbe0 +// fd00:<rack1_sled1>::/64 fe80::<sled1_p6> U cxgbe1 +// +// With these P values added we now have a new option for deciding +// which IRE to return when faced with two matches: give each a +// probability of return based on their P value. In this case, for any +// given gateway IRE lookup, there would be a 70% chance +// `fe80::<sc1_p5>` is returned and a 30% chance `fe80::<sc1_p6>` is +// returned. +// +// But wait, what determines those P values? That's the job of DDM. +// The full story of what DDM is and how it works is outside the scope +// of this already long block comment; but suffice to say it monitors +// the flow of the network based on precise latency measurements and +// with that data constantly refines the P values of all the hosts' +// routing tables to bias new packets towards one path or another. +#[no_mangle] +fn next_hop<'a>( + key: &RouteKey, + ustate: &'a XdeDev, +) -> Result, &'a xde_underlay_port> { + let RouteKey { dst: ip6_dst, l4_hash } = key; + unsafe { + // Use the GZ's routing table. + let netstack = + DropRef::new(netstack_rele, ip::netstack_find_by_zoneid(0)); + assert!(!netstack.inner().is_null()); + let ipst = (*netstack.inner()).netstack_u.nu_s.nu_ip; + assert!(!ipst.is_null()); + + let addr = ip::in6_addr_t { + _S6_un: ip::in6_addr__bindgen_ty_1 { _S6_u8: key.dst.bytes() }, + }; + let xmit_hint = l4_hash.unwrap_or(0); + let mut generation_op = 0u32; + + let mut underlay_dev = &*ustate.u1; + + // Step (1): Lookup the IRE for the destination. This is going + // to return one of the default gateway entries. 
+ let ire = DropRef::new( + ire_refrele, + ip::ire_ftable_lookup_v6( + &addr, + ptr::null(), + ptr::null(), + 0, + ptr::null_mut(), + sys::ALL_ZONES, + ptr::null(), + 0, + xmit_hint, + ipst, + &mut generation_op as *mut ip::uint_t, + ), + ); + + // TODO If there is no entry should we return host + // unreachable? I'm not sure since really the guest would map + // that with its VPC network. That is, if a user saw host + // unreachable they would be correct to think that their VPC + // routing table is misconfigured, but in reality it would be + // an underlay network issue. How do we convey this situation + // to the user/operator? + if ire.inner().is_null() { + // Try without a pinned ill + opte::engine::dbg!("no IRE for destination {:?}", ip6_dst); + next_hop_probe( + ip6_dst, + None, + EtherAddr::zero(), + EtherAddr::zero(), + c"no IRE for destination", + ); + return Err(underlay_dev); + } + let ill = (*ire.inner()).ire_ill; + if ill.is_null() { + opte::engine::dbg!("destination ILL is NULL for {:?}", ip6_dst); + next_hop_probe( + ip6_dst, + None, + EtherAddr::zero(), + EtherAddr::zero(), + c"destination ILL is NULL", + ); + return Err(underlay_dev); + } + + // Step (2): Lookup the IRE for the gateway's link-local + // address. This is going to return one of the `fe80::/10` + // entries. + let ireu = (*ire.inner()).ire_u; + let gw = ireu.ire6_u.ire6_gateway_addr; + let gw_ip6 = Ipv6Addr::from(&ireu.ire6_u.ire6_gateway_addr); + + // NOTE: specifying the ill is important here, because the gateway + // address is going to be of the form fe80::. This means a + // simple query that does not specify an ill could come back with any + // route matching fe80::/10 over any interface. Since all interfaces + // that have an IPv6 link-local address assigned have an associated + // fe80::/10 route, we must restrict our search to the interface that + // actually has a route to the desired (non-link-local) destination. 
+ let flags = ip::MATCH_IRE_ILL as i32; + let gw_ire = DropRef::new( + ire_refrele, + ip::ire_ftable_lookup_v6( + &gw, + ptr::null(), + ptr::null(), + 0, + ill, + sys::ALL_ZONES, + ptr::null(), + flags, + xmit_hint, + ipst, + &mut generation_op as *mut ip::uint_t, + ), + ); + + if gw_ire.inner().is_null() { + opte::engine::dbg!("no IRE for gateway {:?}", gw_ip6); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + EtherAddr::zero(), + EtherAddr::zero(), + c"no IRE for gateway", + ); + return Err(underlay_dev); + } + + // Step (3): Determine the source address of the outer frame + // from the physical address of the IP Lower Layer object + // member or the internet routing entry. + let src = (*ill).ill_phys_addr; + if src.is_null() { + opte::engine::dbg!( + "gateway ILL phys addr is NULL for {:?}", + gw_ip6 + ); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + EtherAddr::zero(), + EtherAddr::zero(), + c"gateway ILL phys addr is NULL", + ); + return Err(underlay_dev); + } + + let src: [u8; 6] = alloc::slice::from_raw_parts(src, 6) + .try_into() + .expect("src mac from pointer"); + + // Switch to the 2nd underlay device if we determine the source mac + // belongs to that device. + if src == ustate.u2.mac { + underlay_dev = &ustate.u2; + } + + let src = EtherAddr::from(src); + + // Step (4): Determine the destination address of the outer + // frame by retrieving the NCE entry for the gateway's + // link-local address. 
+ let nce = DropRef::new(nce_refrele, ip::nce_lookup_v6(ill, &gw)); + if nce.inner().is_null() { + opte::engine::dbg!("no NCE for gateway {:?}", gw_ip6); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + src, + EtherAddr::zero(), + c"no NCE for gateway", + ); + return Err(underlay_dev); + } + + let nce_common = (*nce.inner()).nce_common; + if nce_common.is_null() { + opte::engine::dbg!("no NCE common for gateway {:?}", gw_ip6); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + src, + EtherAddr::zero(), + c"no NCE common for gateway", + ); + return Err(underlay_dev); + } + + let mac = (*nce_common).ncec_lladdr; + if mac.is_null() { + opte::engine::dbg!("NCE MAC address is NULL {:?}", gw_ip6); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + src, + EtherAddr::zero(), + c"NCE MAC address if NULL for gateway", + ); + return Err(underlay_dev); + } + + let maclen = (*nce_common).ncec_lladdr_length; + assert!(maclen == 6); + + let dst: [u8; 6] = alloc::slice::from_raw_parts(mac, 6) + .try_into() + .expect("mac from pointer"); + let dst = EtherAddr::from(dst); + + next_hop_probe(ip6_dst, Some(&gw_ip6), src, dst, c""); + + Ok(Route { src, dst, underlay_dev }) + } +} + +/// A simple caching layer over `next_hop`. +/// +/// [`next_hop`] has a latency distribution which roughly looks like this: +/// ```text +/// t(ns) Count +/// 1024 | 337 +/// 1280 | 108 +/// 1536 |@@@@@@@@@@@@@@@@@@@@@ 376883 +/// 1792 |@@@@@@@@@@@@@@@ 264693 +/// 2048 |@ 17798 +/// 2304 |@ 14791 +/// 2560 |@@ 32901 +/// 2816 |@ 10730 +/// 3072 | 3459 +/// ``` +/// +/// Naturally, bringing this down to O(ns) is desirable. Usually, illumos +/// holds `ire_t`s per `conn_t`, but we're aiming to be more fine-grained +/// with DDM -- so we need a tradeoff between 'asking about the best route +/// per-packet' and 'holding a route until it is expired'. We choose, for now, +/// to hold a route for 100ms. 
+/// +/// Note, this uses a `BTreeMap`, but we would prefer the more consistent +/// (faster) add/remove costs of a `HashMap`. As `BTreeMap` modification costs +/// outpace the cost of `next_hop` between 256--512 entries, we currently set 512 +/// as a cap on cache size to prevent significant packet stalls. This may be tricky +/// to tune. +/// +/// (See: https://github.com/oxidecomputer/opte/pull/499#discussion_r1581164767 +/// for some performance numbers.) +#[derive(Clone)] +pub struct RouteCache(Arc>>); + +impl Default for RouteCache { + fn default() -> Self { + let mut lock = KRwLock::new(BTreeMap::new()); + lock.init(KRwLockType::Driver); + Self(lock.into()) + } +} + +impl RouteCache { + /// Retrieve a [`Route`] (device and L2 information) for a given `key`. + /// + /// This will retrieve an existing entry, if one exists from a recent + /// query, or computes the current route using `next_hop` on miss or + /// discovery of a stale entry. + pub fn next_hop<'b>(&self, key: RouteKey, xde: &'b XdeDev) -> Route<'b> { + let t = Moment::now(); + + let (maybe_route, map_ptr_int) = { + let route_cache = self.0.read(); + ( + route_cache.get(&key).copied(), + &*route_cache as *const BTreeMap<_, _> as uintptr_t, + ) + }; + + match maybe_route { + Some(route) if route.is_valid(t) => { + route_hit_probe(map_ptr_int, &key); + return route.into_route(xde); + } + _ => {} + } + + // Cache miss: intent is to now ask illumos, then insert. + let mut route_cache = self.0.write(); + let space_remaining = route_cache.len() < MAX_CACHE_ENTRIES; + + // Someone else may have written while we were taking the lock. + // DO NOT waste time if there's a good route. 
+ let maybe_route = route_cache.entry(key); + let entry_exists = match &maybe_route { + Entry::Occupied(e) if e.get().is_valid(t) => { + route_hit_probe(map_ptr_int, &key); + return e.get().into_route(xde); + } + Entry::Occupied(_) => true, + _ => false, + }; + + // We've had a definitive flow miss, but we need to cap the cache + // size to prevent excessive modification latencies at high flow + // counts. + // If full and we have no old entry to update, drop the lock and do + // not insert. + // XXX: Want to profile in future to see if LRU expiry is + // affordable/sane here. + // XXX: A HashMap would exchange insert cost for lookup. + if entry_exists || space_remaining { + // `next_hop` might fail for myriad reasons, but we still + // send the packet on an underlay device depending on our + // progress. However, we do not want to cache bad mappings. + match (maybe_route, next_hop(&key, xde)) { + (Entry::Vacant(slot), Ok(route)) => { + route_insert_probe(map_ptr_int, &key); + slot.insert(route.cached(xde, t)); + route + } + (Entry::Occupied(mut slot), Ok(route)) => { + route_refresh_probe(map_ptr_int, &key); + slot.insert(route.cached(xde, t)); + route + } + (_, Err(dev)) => Route::zero_addr(dev), + } + } else { + route_full_probe(map_ptr_int, &key); + drop(route_cache); + match next_hop(&key, xde) { + Ok(route) => route, + Err(dev) => Route::zero_addr(dev), + } + } + } + + /// Discards any cached route entries which have been present + /// for longer than `REMOVE_ROUTE_LIFETIME`. + pub fn remove_routes(&self) { + let mut route_cache = self.0.write(); + + let t = Moment::now(); + let ptr: *const BTreeMap<_, _> = &*route_cache; + let map_ptr_int = ptr as uintptr_t; + + route_cache.retain(|k, v| { + if v.is_retained(t) { + true + } else { + route_delete_probe(map_ptr_int, k); + false + } + }); + } +} + +/// An underlay routing destination and flow-dependent entropy. 
+#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] +pub struct RouteKey { + pub dst: Ipv6Addr, + pub l4_hash: Option, +} + +/// Cached representation of [`Route`]. +#[derive(Copy, Clone, Debug)] +pub struct CachedRoute { + pub src: EtherAddr, + pub dst: EtherAddr, + pub underlay_idx: u8, + pub timestamp: Moment, +} + +impl CachedRoute { + fn is_valid(&self, t: Moment) -> bool { + u128::from(t.delta_as_millis(self.timestamp)) + <= EXPIRE_ROUTE_LIFETIME.as_millis() + } + + fn is_retained(&self, t: Moment) -> bool { + u128::from(t.delta_as_millis(self.timestamp)) + <= REMOVE_ROUTE_LIFETIME.as_millis() + } + + fn into_route(self, xde: &XdeDev) -> Route<'_> { + Route { + src: self.src, + dst: self.dst, + // This is not a pretty construction, and will not work for + // a hypothetically higher port count. + underlay_dev: if self.underlay_idx == 0 { + &xde.u1 + } else { + &xde.u2 + }, + } + } +} + +/// Output port and L2 information needed to emit a packet over the underlay. +#[derive(Copy, Clone, Debug)] +pub struct Route<'a> { + pub src: EtherAddr, + pub dst: EtherAddr, + pub underlay_dev: &'a xde_underlay_port, +} + +impl<'a> Route<'a> { + fn cached(&self, xde: &XdeDev, timestamp: Moment) -> CachedRoute { + // As unfortunate as `into_route`. 
+ let port_0: &xde_underlay_port = &xde.u1; + let underlay_idx = + if core::ptr::eq(self.underlay_dev, port_0) { 0 } else { 1 }; + + CachedRoute { src: self.src, dst: self.dst, underlay_idx, timestamp } + } + + fn zero_addr(underlay_dev: &'a xde_underlay_port) -> Route<'a> { + Self { src: EtherAddr::zero(), dst: EtherAddr::zero(), underlay_dev } + } +} diff --git a/xde/src/xde.rs b/xde/src/xde.rs index de9dfa11..2199fa8f 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -14,7 +14,6 @@ use crate::dls; use crate::ioctl::IoctlEnvelope; -use crate::ip; use crate::mac; use crate::mac::mac_getinfo; use crate::mac::mac_private_minor; @@ -24,6 +23,9 @@ use crate::mac::MacOpenFlags; use crate::mac::MacPromiscHandle; use crate::mac::MacTxFlags; use crate::mac::MacUnicastHandle; +use crate::route::Route; +use crate::route::RouteCache; +use crate::route::RouteKey; use crate::secpolicy; use crate::sys; use crate::warn; @@ -141,13 +143,6 @@ extern "C" { dst_port: uintptr_t, ); pub fn __dtrace_probe_hdlr__resp(resp_str: uintptr_t); - pub fn __dtrace_probe_next__hop( - dst: uintptr_t, - gw: uintptr_t, - gw_ether_src: uintptr_t, - gw_ether_dst: uintptr_t, - msg: *const c_char, - ); pub fn __dtrace_probe_rx(mp: uintptr_t); pub fn __dtrace_probe_tx(mp: uintptr_t); } @@ -204,34 +199,14 @@ fn bad_packet_probe( }; } -fn next_hop_probe( - dst: &Ipv6Addr, - gw: Option<&Ipv6Addr>, - gw_eth_src: EtherAddr, - gw_eth_dst: EtherAddr, - msg: &CStr, -) { - let gw_bytes = gw.unwrap_or(&Ipv6Addr::from([0u8; 16])).bytes(); - - unsafe { - __dtrace_probe_next__hop( - dst.bytes().as_ptr() as uintptr_t, - gw_bytes.as_ptr() as uintptr_t, - gw_eth_src.to_bytes().as_ptr() as uintptr_t, - gw_eth_dst.to_bytes().as_ptr() as uintptr_t, - msg.as_ptr(), - ); - } -} - /// Underlay port state. #[derive(Debug)] -struct xde_underlay_port { +pub struct xde_underlay_port { /// Name of the link being used for this underlay port. 
- name: String, + pub name: String, /// The MAC address associated with this underlay port. - mac: [u8; 6], + pub mac: [u8; 6], /// MAC handle to the underlay link. mh: Arc, @@ -283,7 +258,7 @@ impl XdeState { } #[repr(C)] -struct XdeDev { +pub struct XdeDev { devname: String, linkid: datalink_id_t, mh: *mut mac::mac_handle, @@ -307,8 +282,14 @@ struct XdeDev { // These are clones of the underlay ports initialized by the // driver. - u1: Arc, - u2: Arc, + pub u1: Arc, + pub u2: Arc, + + // We make this a per-port cache rather than sharing between all + // ports to theoretically reduce contention around route expiry + // and reinsertion. + routes: RouteCache, + routes_periodic: Periodic, } #[cfg(not(test))] @@ -630,6 +611,11 @@ fn expire_periodic(port: &mut Arc>) { let _ = port.expire_flows(); } +#[no_mangle] +fn expire_route_cache(routes: &mut RouteCache) { + routes.remove_routes() +} + #[no_mangle] fn create_xde(req: &CreateXdeReq) -> Result { // TODO name validation @@ -706,6 +692,15 @@ fn create_xde(req: &CreateXdeReq) -> Result { ONE_SECOND, ); + let routes = RouteCache::default(); + + let routes_periodic = Periodic::new( + port.name_cstr().clone(), + expire_route_cache, + Box::new(routes.clone()), + ONE_SECOND, + ); + let mut xde = Box::new(XdeDev { devname: req.xde_devname.clone(), linkid: req.linkid, @@ -719,6 +714,8 @@ fn create_xde(req: &CreateXdeReq) -> Result { passthrough: req.passthrough, u1: underlay.u1.clone(), u2: underlay.u2.clone(), + routes, + routes_periodic, }); drop(underlay_); @@ -1526,7 +1523,7 @@ unsafe extern "C" fn xde_mc_tx( // by the mch they're being targeted to. E.g., either build a list // of chains (u1, u2, port0, port1, ...), or hold tx until another // packet breaks the run targeting the same dest. 
- while let Some(pkt) = chain.next() { + while let Some(pkt) = chain.pop_front() { xde_mc_tx_one(src_dev, pkt); } @@ -1628,8 +1625,14 @@ unsafe fn xde_mc_tx_one( // for the mac associated with the IRE nexthop to fill in // the outer frame of the packet. Also return the underlay // device associated with the nexthop - let (src, dst, underlay_dev) = - next_hop(&ip6.dst, src_dev, meta.l4_hash()); + // + // As route lookups are fairly expensive, we can cache their + // results for a given dst + entropy. These have a fairly tight + // expiry so that we can actually react to new reachability/load + // info from DDM. + let my_key = RouteKey { dst: ip6.dst, l4_hash: meta.l4_hash() }; + let Route { src, dst, underlay_dev } = + src_dev.routes.next_hop(my_key, src_dev); // Get a pointer to the beginning of the outer frame and // fill in the dst/src addresses before sending out the @@ -1670,7 +1673,7 @@ unsafe fn xde_mc_tx_one( /// This is a generic wrapper for references that should be dropped once not in /// use. -struct DropRef +pub(crate) struct DropRef where DropFn: Fn(*mut Arg), { @@ -1686,12 +1689,12 @@ where { /// Create a new `DropRef` for the provided reference argument. When this /// object is dropped, the provided `func` will be called. - fn new(func: DropFn, arg: *mut Arg) -> Self { + pub(crate) fn new(func: DropFn, arg: *mut Arg) -> Self { Self { func, arg } } /// Return a pointer to the underlying reference. - fn inner(&self) -> *mut Arg { + pub(crate) fn inner(&self) -> *mut Arg { self.arg } } @@ -1708,344 +1711,6 @@ where } } -// The following are wrappers for reference drop functions used in XDE. 
- -fn ire_refrele(ire: *mut ip::ire_t) { - unsafe { ip::ire_refrele(ire) } -} - -fn nce_refrele(ire: *mut ip::nce_t) { - unsafe { ip::nce_refrele(ire) } -} - -fn netstack_rele(ns: *mut ip::netstack_t) { - unsafe { ip::netstack_rele(ns) } -} - -// At this point the core engine of OPTE has delivered a Geneve -// encapsulated guest Ethernet Frame (also simply referred to as "the -// packet") to xde to be sent to the specific outer IPv6 destination -// address. This packet includes the outer Ethernet Frame as well; -// however, the outer frame's destination and source addresses are set -// to zero. It is the job of this function to determine what those -// values should be. -// -// Adjacent to xde is the native IPv6 stack along with its routing -// table. This table is routinely updated to indicate the best path to -// any given IPv6 destination that may be specified in the outer IP -// header. As xde is not utilizing the native IPv6 stack to send out -// the packet, but rather is handing it directly to the mac module, it -// must somehow query the native routing table to determine which port -// this packet should egress and fill in the outer frame accordingly. -// This query is done via a private interface which allows a kernel -// module outside of IP to query the routing table. -// -// This process happens in a sequence of steps described below. -// -// 1. With an IPv6 destination in hand we need to determine the next -// hop, also known as the gateway, for this address. That is, of -// our neighbors (in this case one of the two switches, which are -// also acting as routers), who should we forward this packet to in -// order for it to arrive at its destination? We get this -// information from the routing table, which contains Internet -// Routing Entries, or IREs. Specifically, we query the native IPv6 -// routing table using the kernel function -// `ire_ftable_lookup_simple_v6()`. 
This function returns an -// `ire_t`, which includes the member `ire_u`, which contains the -// address of the gateway as `ire6_gateway_addr`. -// -// 2. We have the gateway IPv6 address; but in the world of the Oxide -// Network that is not enough to deliver the packet. In the Oxide -// Network the router (switch) is not a member of the host's -// network. Instead, we rely on link-local addresses to reach the -// switches. The lookup in step (1) gave us that link-local address -// of the gateway; now we need to figure out how to reach it. That -// requires consulting the routing table a second time: this time -// to find the IRE for the gateway's link-local address. -// -// 3. The IRE of the link-local address from step (2) allows us to -// determine which interface this traffic should traverse. -// Specifically it gives us access to the `ill_t` of the gateway's -// link-local address. This structure contains the IP Lower Level -// information. In particular it contains the `ill_phys_addr` -// which gives us the source MAC address for our outer frame. -// -// 4. The final piece of information to obtain is the destination MAC -// address. We have the link-local address of the switch port we -// want to send to. To get the MAC address of this port it must -// first be assumed that the host and its connected switches have -// performed NDP in order to learn each other's IPv6 addresses and -// corresponding MAC addresses. With that information in hand it is -// a matter of querying the kernel's Neighbor Cache Entry Table -// (NCE) for the mapping that belongs to our gateway's link-local -// address. This is done via the `nce_lookup_v6()` kernel function. -// -// With those four steps we have obtained the source and destination -// MAC addresses and the packet can be sent to mac to be delivered to -// the underlying NIC. However, the careful reader may find themselves -// confused about how step (1) actually works. 
-// -// If step (1) always returns a single gateway, then how do we -// actually utilize both NICs/switches? -// -// This is where a bit of knowledge about routing tables comes into -// play along with our very own Delay Driven Multipath (DDM) in-rack routing -// protocol. You might imagine the IPv6 routing table on an Oxide Sled -// looking something like this. -// -// Destination/Mask Gateway Flags If -// ---------------- ------------------------- ----- --------- -// default fe80:: UG cxgbe0 -// default fe80:: UG cxgbe1 -// fe80::/10 fe80:: U cxgbe0 -// fe80::/10 fe80:: U cxgbe1 -// fd00:{sled1}::/64 fe80:: U cxgbe0 -// fd00:{sled1}::/64 fe80:: U cxgbe1 -// -// Let's say this host (sled1) wants to send a packet to sled2. Our -// sled1 host lives on network `fd00:{sled1}::/64` while our -// sled2 host lives on `fd00:{sled2}::/64` -- the key point -// being they are two different networks and thus must be routed to -// talk to each other. For sled1 to send this packet it will attempt -// to look up destination `fd00:{sled2}::7777` (in this case -// `7777` is the IP of sled2) in the routing table above. The routing -// table will then perform a longest prefix match against the -// `Destination` field for all entries: the longest prefix that -// matches wins and that entry is returned. However, in this case, no -// destinations match except for the `default` ones. When more than -// one entry matches it is left to the system to decide which one to -// return; typically this just means the first one that matches. But -// not for us! This is where DDM comes into play. -// -// Let's reimagine the routing table again, this time with a -// probability added to each gateway entry. 
-// -// Destination/Mask Gateway Flags If P -// ---------------- ------------------------- ----- ------- ---- -// default fe80:: UG cxgbe0 0.70 -// default fe80:: UG cxgbe1 0.30 -// fe80::/10 fe80:: U cxgbe0 -// fe80::/10 fe80:: U cxgbe1 -// fd00:{sled1}::/64 fe80:: U cxgbe0 -// fd00:{sled1}::/64 fe80:: U cxgbe1 -// -// With these P values added we now have a new option for deciding -// which IRE to return when faced with two matches: give each a -// probability of return based on their P value. In this case, for any -// given gateway IRE lookup, there would be a 70% chance -// the `cxgbe0` gateway is returned and a 30% chance the `cxgbe1` gateway is -// returned. -// -// But wait, what determines those P values? That's the job of DDM. -// The full story of what DDM is and how it works is outside the scope -// of this already long block comment; but suffice to say it monitors -// the flow of the network based on precise latency measurements and -// with that data constantly refines the P values of all the hosts' -// routing tables to bias new packets towards one path or another. -#[no_mangle] -fn next_hop<'a>( - ip6_dst: &Ipv6Addr, - ustate: &'a XdeDev, - overlay_hash: Option, -) -> (EtherAddr, EtherAddr, &'a xde_underlay_port) { - unsafe { - // Use the GZ's routing table. - let netstack = - DropRef::new(netstack_rele, ip::netstack_find_by_zoneid(0)); - assert!(!netstack.inner().is_null()); - let ipst = (*netstack.inner()).netstack_u.nu_s.nu_ip; - assert!(!ipst.is_null()); - - let addr = ip::in6_addr_t { - _S6_un: ip::in6_addr__bindgen_ty_1 { _S6_u8: ip6_dst.bytes() }, - }; - let xmit_hint = overlay_hash.unwrap_or(0); - let mut generation_op = 0u32; - - let mut underlay_port = &*ustate.u1; - - // Step (1): Lookup the IRE for the destination. This is going - // to return one of the default gateway entries. 
- let ire = DropRef::new( - ire_refrele, - ip::ire_ftable_lookup_v6( - &addr, - ptr::null(), - ptr::null(), - 0, - ptr::null_mut(), - sys::ALL_ZONES, - ptr::null(), - 0, - xmit_hint, - ipst, - &mut generation_op as *mut ip::uint_t, - ), - ); - - // TODO If there is no entry should we return host - // unreachable? I'm not sure since really the guest would map - // that with its VPC network. That is, if a user saw host - // unreachable they would be correct to think that their VPC - // routing table is misconfigured, but in reality it would be - // an underlay network issue. How do we convey this situation - // to the user/operator? - if ire.inner().is_null() { - // Try without a pinned ill - opte::engine::dbg!("no IRE for destination {:?}", ip6_dst); - next_hop_probe( - ip6_dst, - None, - EtherAddr::zero(), - EtherAddr::zero(), - c"no IRE for destination", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - let ill = (*ire.inner()).ire_ill; - if ill.is_null() { - opte::engine::dbg!("destination ILL is NULL for {:?}", ip6_dst); - next_hop_probe( - ip6_dst, - None, - EtherAddr::zero(), - EtherAddr::zero(), - c"destination ILL is NULL", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - // Step (2): Lookup the IRE for the gateway's link-local - // address. This is going to return one of the `fe80::/10` - // entries. - let ireu = (*ire.inner()).ire_u; - let gw = ireu.ire6_u.ire6_gateway_addr; - let gw_ip6 = Ipv6Addr::from(&ireu.ire6_u.ire6_gateway_addr); - - // NOTE: specifying the ill is important here, because the gateway - // address is going to be of the form fe80::. This means a - // simple query that does not specify an ill could come back with any - // route matching fe80::/10 over any interface. 
Since all interfaces - // that have an IPv6 link-local address assigned have an associated - // fe80::/10 route, we must restrict our search to the interface that - // actually has a route to the desired (non-link-local) destination. - let flags = ip::MATCH_IRE_ILL as i32; - let gw_ire = DropRef::new( - ire_refrele, - ip::ire_ftable_lookup_v6( - &gw, - ptr::null(), - ptr::null(), - 0, - ill, - sys::ALL_ZONES, - ptr::null(), - flags, - xmit_hint, - ipst, - &mut generation_op as *mut ip::uint_t, - ), - ); - - if gw_ire.inner().is_null() { - opte::engine::dbg!("no IRE for gateway {:?}", gw_ip6); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - EtherAddr::zero(), - EtherAddr::zero(), - c"no IRE for gateway", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - // Step (3): Determine the source address of the outer frame - // from the physical address of the IP Lower Layer object - // member of the internet routing entry. - let src = (*ill).ill_phys_addr; - if src.is_null() { - opte::engine::dbg!( - "gateway ILL phys addr is NULL for {:?}", - gw_ip6 - ); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - EtherAddr::zero(), - EtherAddr::zero(), - c"gateway ILL phys addr is NULL", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - let src: [u8; 6] = alloc::slice::from_raw_parts(src, 6) - .try_into() - .expect("src mac from pointer"); - - // Switch to the 2nd underlay device if we determine the source mac - // belongs to that device. - if src == ustate.u2.mac { - underlay_port = &ustate.u2; - } - - let src = EtherAddr::from(src); - - // Step (4): Determine the destination address of the outer - // frame by retrieving the NCE entry for the gateway's - // link-local address. 
- let nce = DropRef::new(nce_refrele, ip::nce_lookup_v6(ill, &gw)); - if nce.inner().is_null() { - opte::engine::dbg!("no NCE for gateway {:?}", gw_ip6); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - src, - EtherAddr::zero(), - c"no NCE for gateway", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - let nce_common = (*nce.inner()).nce_common; - if nce_common.is_null() { - opte::engine::dbg!("no NCE common for gateway {:?}", gw_ip6); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - src, - EtherAddr::zero(), - c"no NCE common for gateway", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - let mac = (*nce_common).ncec_lladdr; - if mac.is_null() { - opte::engine::dbg!("NCE MAC address is NULL {:?}", gw_ip6); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - src, - EtherAddr::zero(), - c"NCE MAC address if NULL for gateway", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - let maclen = (*nce_common).ncec_lladdr_length; - assert!(maclen == 6); - - let dst: [u8; 6] = alloc::slice::from_raw_parts(mac, 6) - .try_into() - .expect("mac from pointer"); - let dst = EtherAddr::from(dst); - - next_hop_probe(ip6_dst, Some(&gw_ip6), src, dst, c""); - - (src, dst, underlay_port) - } -} - #[no_mangle] unsafe extern "C" fn xde_mc_getcapab( _arg: *mut c_void, @@ -2160,7 +1825,7 @@ unsafe extern "C" fn xde_rx( // by the mch they're being targeted to. E.g., either build a list // of chains (port0, port1, ...), or hold tx until another // packet breaks the run targeting the same dest. - while let Some(pkt) = chain.next() { + while let Some(pkt) = chain.pop_front() { xde_rx_one(&mch, mrh, pkt); } }