41 changes: 29 additions & 12 deletions src/commands/dhcp_proxy.rs
@@ -38,6 +38,8 @@ use tonic::{
transport::Server, Code, Code::Internal, Code::InvalidArgument, Request, Response, Status,
};

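/// Per-container bookkeeping: the shared DHCP service (so teardown can send a
/// DHCPRELEASE) and the abort handle of the task polling for lease renewals.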
type TaskData = (Arc<tokio::sync::Mutex<DhcpV4Service>>, AbortHandle);

#[derive(Debug)]
/// This is the tonic netavark proxy service that is required to impl the Netavark Proxy trait which
/// includes the gRPC methods defined in proto/proxy.proto. We can store an atomically reference-counted
@@ -59,7 +61,7 @@ struct NetavarkProxyService<W: Write + Clear> {
timeout_sender: Option<Arc<Mutex<Sender<i32>>>>,
// Each dhcp poll is spawned on a new task; keep track of it so
// we can remove it on teardown. The key is the container mac.
task_map: Arc<Mutex<HashMap<String, AbortHandle>>>,
task_map: Arc<Mutex<HashMap<String, TaskData>>>,
}

impl<W: Write + Clear> NetavarkProxyService<W> {
@@ -136,8 +138,8 @@ impl<W: Write + Clear + Send + 'static> NetavarkProxy for NetavarkProxyService<W
};
}

/// When a container is shut down this method should be called. It will clear the lease information
/// from the caching system.
/// When a container is shut down this method should be called. It will release the
/// DHCP lease and clear the lease information from the caching system.
async fn teardown(
&self,
request: Request<NetworkConfig>,
@@ -149,12 +151,25 @@ impl<W: Write + Clear + Send + 'static> NetavarkProxy for NetavarkProxyService<W
let cache = self.cache.clone();
let tasks = self.task_map.clone();

let task = tasks
.lock()
.expect("lock tasks")
.remove(&nc.container_mac_addr);
if let Some(handle) = task {
handle.abort();
let maybe_service_arc = {
// Scope for the std::sync::MutexGuard
let mut tasks_guard = tasks.lock().expect("lock tasks");

if let Some((service_arc, handle)) = tasks_guard.remove(&nc.container_mac_addr) {
handle.abort();
Some(service_arc)
} else {
None
}
};
if let Some(service_arc) = maybe_service_arc {
let mut service = service_arc.lock().await;
if let Err(e) = service.release_lease() {
warn!(
"Failed to send DHCPRELEASE for {}: {}",
&nc.container_mac_addr, e
);
}
}

// Remove the client from the cache dir
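The scoped block above is deliberate: `tasks` is guarded by a std::sync::Mutex, and its guard must be dropped before the later `.await` on the tokio mutex, since holding a std guard across an await point makes the future non-Send and can block the runtime. A minimal sketch of the pattern, assuming only tokio (the names are illustrative, not netavark's):

use std::collections::HashMap;
use std::sync::{Arc, Mutex};

#[tokio::main]
async fn main() {
    let tasks: Arc<Mutex<HashMap<String, u32>>> = Arc::new(Mutex::new(HashMap::new()));
    tasks.lock().expect("lock tasks").insert("aa:bb:cc:dd:ee:ff".into(), 1);

    // Remove the entry inside a block so the std::sync::MutexGuard is dropped
    // at the closing brace, before any .await point is reached.
    let removed = {
        let mut guard = tasks.lock().expect("lock tasks");
        guard.remove("aa:bb:cc:dd:ee:ff")
    }; // guard dropped here

    if let Some(v) = removed {
        // No std guard is held now, so awaiting is fine (and the future stays Send).
        tokio::task::yield_now().await;
        println!("tore down task {v}");
    }
}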
Expand Down Expand Up @@ -406,7 +421,7 @@ async fn process_setup<W: Write + Clear>(
network_config: NetworkConfig,
timeout: u32,
cache: Arc<Mutex<LeaseCache<W>>>,
tasks: Arc<Mutex<HashMap<String, AbortHandle>>>,
tasks: Arc<Mutex<HashMap<String, TaskData>>>,
) -> Result<NetavarkLease, Status> {
let container_network_interface = network_config.container_iface.clone();
let ns_path = network_config.ns_path.clone();
@@ -422,11 +437,13 @@
let mut service = DhcpV4Service::new(network_config, timeout)?;

let lease = service.get_lease().await?;
let task = tokio::spawn(process_client_stream(service));
let service_arc = Arc::new(tokio::sync::Mutex::new(service));
let service_arc_clone = service_arc.clone();
let task_handle = tokio::spawn(process_client_stream(service_arc_clone));
tasks
.lock()
.expect("lock tasks")
.insert(mac.to_string(), task.abort_handle());
.insert(mac.to_string(), (service_arc, task_handle.abort_handle()));
lease
}
//V6 TODO implement DHCPv6
99 changes: 63 additions & 36 deletions src/dhcp_proxy/dhcp_service.rs
@@ -11,8 +11,9 @@ use crate::network::netlink::Route;
use crate::wrap;
use log::debug;
use mozim::{DhcpV4ClientAsync, DhcpV4Config, DhcpV4Lease as MozimV4Lease};
use std::sync::Arc;
use tokio::sync::Mutex;
use tokio_stream::StreamExt;

use tonic::{Code, Status};

/// The kind of DhcpServiceError that can be caused when finding a dhcp lease
@@ -39,6 +40,7 @@ impl DhcpServiceError {
}

/// DHCP service is responsible for creating, handling, and managing the dhcp lease process.
#[derive(Debug)]
pub struct DhcpV4Service {
client: DhcpV4ClientAsync,
network_config: NetworkConfig,
@@ -129,6 +131,28 @@ impl DhcpV4Service {
"Could not find a lease within the timeout limit".to_string(),
))
}

/// Sends a DHCPRELEASE message for the given lease.
/// This is a "best effort" operation and should not block teardown.
pub fn release_lease(&mut self) -> Result<(), DhcpServiceError> {
if let Some(lease) = &self.previous_lease {
debug!(
"Attempting to release lease for MAC: {}",
&self.network_config.container_mac_addr
);
// Directly call the release function on the underlying mozim client.
self.client
.release(lease)
.map_err(|e| DhcpServiceError::new(Bug, e.to_string()))
} else {
// No previous lease recorded; nothing to release. Best-effort -> succeed silently.
debug!(
"No previous lease to release for MAC: {}",
&self.network_config.container_mac_addr
);
Ok(())
}
}
}

impl std::fmt::Display for DhcpServiceError {
@@ -149,50 +173,53 @@ impl From<DhcpServiceError> for Status {
}
}

pub async fn process_client_stream(mut client: DhcpV4Service) {
    while let Some(lease) = client.client.next().await {
        match lease {
            Ok(lease) => {
                log::info!(
                    "got new lease for mac {}: {:?}",
                    &client.network_config.container_mac_addr,
                    &lease
                );
                // get previous lease and check if ip addr changed, if not we do not have to do anything
                if let Some(old_lease) = &client.previous_lease {
                    if old_lease.yiaddr != lease.yiaddr
                        || old_lease.subnet_mask != lease.subnet_mask
                        || old_lease.gateways != lease.gateways
                    {
                        // ips do not match, remove old ones and assign new ones.
                        log::info!(
                            "ip or gateway for mac {} changed, update address",
                            &client.network_config.container_mac_addr
                        );
                        match update_lease_ip(
                            &client.network_config.ns_path,
                            &client.network_config.container_iface,
                            old_lease,
                            &lease,
                        ) {
                            Ok(_) => {}
                            Err(err) => {
                                log::error!("{err}");
                                continue;
                            }
                        }
                    }
                }
                client.previous_lease = Some(lease)
            }
            Err(err) => log::error!(
                "Failed to renew lease for {}: {err}",
                &client.network_config.container_mac_addr
            ),
        }
    }
}

pub async fn process_client_stream(service_arc: Arc<Mutex<DhcpV4Service>>) {
    loop {
        let lease_result = service_arc.lock().await.client.next().await;

        if let Some(lease_result) = lease_result {
            let mut client = service_arc.lock().await;
            match lease_result {
                Ok(lease) => {
                    log::info!(
                        "got new lease for mac {}: {:?}",
                        &client.network_config.container_mac_addr,
                        &lease
                    );

                    if let Some(old_lease) = &client.previous_lease {
                        if old_lease.yiaddr != lease.yiaddr
                            || old_lease.subnet_mask != lease.subnet_mask
                            || old_lease.gateways != lease.gateways
                        {
                            log::info!(
                                "ip or gateway for mac {} changed, update address",
                                &client.network_config.container_mac_addr
                            );
                            if let Err(err) = update_lease_ip(
                                &client.network_config.ns_path,
                                &client.network_config.container_iface,
                                old_lease,
                                &lease,
                            ) {
                                log::error!("{err}");
                                continue;
                            }
                        }
                    }
                    client.previous_lease = Some(lease);
                }
                Err(err) => {
                    log::error!(
                        "Failed to renew lease for {}: {err}",
                        &client.network_config.container_mac_addr
                    );
                }
            }
        } else {
            break;
        }
    }
}

Luap99 (Member): Could this not just stay a loop like this?

while let Some(lease) = service_arc.lock().await.client.next().await {
    ...
}

The extra scopes to reduce lock holding are interesting but not actually doing anything, since this function is the only one holding the lock anyway.

PR author (Contributor): Sorry @Luap99, but this was done to prevent a potential deadlock scenario.

If we use:

while let Some(lease) = service_arc.lock().await.client.next().await {
    ...
}

we end up holding the lock on service_arc for the entire loop body. Later, inside the Ok arm of the match on the lease, when we need to access the client inside DhcpV4Service, we again need to acquire the lock on service_arc:

match lease {
    Ok(lease) => {
        let mut client = service_arc.lock().await;
        log::info!(
            "got new lease for mac {}: {:?}",
            &client.network_config.container_mac_addr,
            &lease
        );
        // rest of the code
    }
}

This would result in a deadlock.

To avoid this, I restructured it into a loop with equivalent stopping conditions using an if/else and break. This way, the lock is released after acquiring the lease and then reacquired only when needed to access the client.
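To make the hazard discussed in the thread concrete, here is a minimal, self-contained sketch — `Service` and `next_event` are hypothetical stand-ins for DhcpV4Service and the mozim client stream, and only tokio is assumed:

use std::sync::Arc;
use tokio::sync::Mutex;

// Hypothetical stand-in for DhcpV4Service: yields a few events, then None.
struct Service {
    remaining: u32,
    processed: u32,
}

impl Service {
    fn next_event(&mut self) -> Option<u32> {
        if self.remaining == 0 {
            None
        } else {
            self.remaining -= 1;
            Some(self.remaining)
        }
    }
}

#[tokio::main]
async fn main() {
    let service = Arc::new(Mutex::new(Service { remaining: 3, processed: 0 }));

    // DEADLOCK variant (do not do this): the guard created in the `while let`
    // scrutinee lives until the end of each loop iteration, so the inner
    // `lock().await` can never acquire the non-reentrant tokio Mutex:
    //
    // while let Some(event) = service.lock().await.next_event() {
    //     let mut s = service.lock().await; // hangs forever
    //     s.processed += event;
    // }

    // Safe restructuring, mirroring the PR: take the event in its own
    // statement so the first guard is dropped before the lock is re-taken.
    loop {
        let maybe_event = service.lock().await.next_event(); // guard dropped here

        if let Some(event) = maybe_event {
            let mut s = service.lock().await; // re-acquired without deadlock
            s.processed += event;
        } else {
            break;
        }
    }

    println!("processed: {}", service.lock().await.processed);
}

The key detail is that temporaries in a `while let` scrutinee are kept alive until the end of each loop iteration, so the guard from the first `lock().await` would still be held when the body locks again; tokio's Mutex is not reentrant, so the second lock waits forever.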

fn update_lease_ip(
netns: &str,
interface: &str,