170 changes: 167 additions & 3 deletions nomad/metrics/metrics.py
@@ -149,7 +149,7 @@ def _group_centroid(g):
w = None
return _centroid(pts, metric=metric, weight=w)

-    cent = stops.groupby(keys).apply(_group_centroid, include_groups=False)
+    cent = stops.groupby(keys).apply(_group_centroid)

cent_df = pd.DataFrame(
cent.tolist(),
@@ -185,5 +185,169 @@ def _group_rog(g):
else:
return np.sqrt(g['d2'].mean())

-    rog = stops.groupby(keys).apply(_group_rog, include_groups=False)
-    return rog.reset_index(name='rog')
+    rog = stops.groupby(keys).apply(_group_rog)
+    return rog.reset_index(name='rog')
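Note on the two changes above: GroupBy.apply's include_groups keyword only exists in pandas >= 2.2, so dropping it keeps these calls working on older pandas, at the cost of a DeprecationWarning on newer versions (where including the grouping columns in apply is deprecated). A minimal version-agnostic sketch; the helper name is hypothetical and assumes the applied function never reads the grouping columns:

    import inspect

    def _apply_excluding_groups(grouped, func):
        # Pass include_groups=False where supported (pandas >= 2.2); older
        # pandas would forward the unknown keyword to func, so fall back.
        if "include_groups" in inspect.signature(grouped.apply).parameters:
            return grouped.apply(func, include_groups=False)
        return grouped.apply(func)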

def self_containment(stops, threshold, agg_freq='d', weighted=True, home_activity_type='home',
traj_cols=None, time_weights=None,
exploded=True, **kwargs):
"""
Compute self-containment (proportion of non-home time spent within threshold distance from home).

Self-containment describes the propensity of individuals to stay close to home. It is calculated
as the time-weighted proportion of non-home activities that are within a threshold distance from home.

Parameters
----------
stops : pd.DataFrame
Stop data with spatial coordinates, duration, and location_id.
    threshold : float
        Distance threshold. In coordinate units for projected data (typically
        meters); for lat/lon input, in the units returned by
        utils._haversine_distance (typically meters, not degrees).
        Activities within this distance from home are considered "contained".
    agg_freq : str
        Pandas offset alias for time-bucketing. Only daily and weekly
        frequencies are supported ('d'/'D' or 'w'/'W').
weighted : bool
If True, weight by duration; else unweighted (count activities).
    home_activity_type : str
        Value in the location_id column that marks home stops. Default is
        'home'; any other location_id value can be used.
traj_cols : dict, optional
Mapping for x/y (or lon/lat), timestamp/datetime, duration, user_id, location_id.
time_weights : pd.Series, optional
Additional time weights to multiply with duration (if weighted=True).
exploded : bool
If True, explode stops that straddle multiple time periods. Default is True.
**kwargs
Additional arguments passed to explode_stops or column overrides.

Returns
-------
pd.DataFrame
Columns = [period, user_id?, self_containment].
self_containment is the proportion [0, 1] of non-home time spent within threshold from home.
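
    Examples
    --------
    A minimal sketch with projected coordinates; the column names below are
    assumed defaults, adapt them to your schema (or pass traj_cols):

    >>> import pandas as pd
    >>> stops = pd.DataFrame({
    ...     'user_id': ['u1', 'u1', 'u1'],
    ...     'x': [0.0, 500.0, 3000.0],
    ...     'y': [0.0, 0.0, 0.0],
    ...     'timestamp': [1704067200, 1704070800, 1704074400],
    ...     'duration': [600, 300, 300],
    ...     'location_id': ['home', 'shop', 'work'],
    ... })
    >>> self_containment(stops, threshold=1000, agg_freq='D', exploded=False)

    With duration weighting, 'shop' (300 s at 500 m) is within the 1 km
    threshold and 'work' (300 s at 3000 m) is not, so the single daily value
    is 300 / 600 = 0.5.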
"""
stops = stops.copy()

    # Restrict agg_freq to days and weeks, and normalize case (lowercase
    # period aliases are deprecated in recent pandas)
    allowed_freqs = ['d', 'w', 'D', 'W']
    if agg_freq not in allowed_freqs:
        raise ValueError(f"agg_freq must be one of {allowed_freqs} (got '{agg_freq}')")
    agg_freq = agg_freq.upper()

# Parse column mappings (similar to compute_candidate_homes)
traj_cols = loader._parse_traj_cols(stops.columns, traj_cols, kwargs)

# Check for required location_id column
if traj_cols["location_id"] not in stops.columns:
raise ValueError(f"Missing required '{traj_cols['location_id']}' column")

# Warn if no home locations exist in the entire dataset
if (stops[traj_cols["location_id"]] == home_activity_type).sum() == 0:
warnings.warn(
f"No home locations found (location_id == '{home_activity_type}'). "
f"Self-containment cannot be calculated without home locations.",
UserWarning
)

# Add time_weights column if provided
if time_weights is not None:
if isinstance(time_weights, pd.Series) and (len(time_weights) == len(stops)):
stops['time_weights'] = time_weights
else:
raise ValueError("time_weights must be a pd.Series with the same length and index as stops.")

    if exploded:
        n_before = len(stops)
        stops = utils.explode_stops(stops, agg_freq=agg_freq, **kwargs)
        if len(stops) > n_before:
            warnings.warn(
                f"Some stops straddle multiple {agg_freq}s. They have been exploded into separate rows.",
                UserWarning
            )
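        # e.g. with agg_freq='D', a stop running 23:30-00:30 becomes two rows
        # (23:30-00:00 and 00:00-00:30), assuming explode_stops splits
        # durations at period boundaries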

# 1) Column mapping + check
t_key, coord_x, coord_y, use_datetime, use_lon_lat = utils._fallback_st_cols(stops.columns, traj_cols, kwargs)
dur_key = traj_cols['duration']
if dur_key not in stops.columns:
raise ValueError("Missing required 'duration' column")

# 2) Time buckets
if use_datetime:
temp_dt = stops[traj_cols[t_key]]
else:
temp_dt = pd.to_datetime(stops[traj_cols[t_key]], unit='s')
if agg_freq == "W":
agg_freq = "W-MON"

stops['period'] = temp_dt.dt.to_period(agg_freq).dt.start_time
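    # e.g. Timestamp('2024-01-03 14:00') buckets to 2024-01-03 00:00 with 'D',
    # and to Monday 2024-01-01 00:00 with 'W-SUN'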

# 3) Grouping keys
keys = ['period']
uid_key = traj_cols['user_id']
if uid_key in stops.columns:
keys.append(uid_key)

# 4) Calculate distance from home for each group
metric = 'haversine' if use_lon_lat else 'euclidean'

# Initialize distance column
stops['dist_from_home'] = np.nan

# Calculate for each group
for group_keys_tuple, group_df in stops.groupby(keys):
home_stops = group_df[group_df[traj_cols["location_id"]] == home_activity_type]

if len(home_stops) == 0:
# No home location in this group
continue

        # Use the first home location as reference
        if metric == 'haversine':
            home_coords = home_stops[[traj_cols['latitude'], traj_cols['longitude']]].iloc[0].values
            home_coords_rad = np.radians(home_coords)

            # Per-point great-circle distance; _haversine_distance is assumed
            # to return distance in the same units as `threshold` (typically
            # meters). Assigning once per group avoids per-cell .loc writes.
            coords_rad = np.radians(group_df[[traj_cols['latitude'], traj_cols['longitude']]].to_numpy())
            distances = np.array([utils._haversine_distance(pt, home_coords_rad) for pt in coords_rad])
            stops.loc[group_df.index, 'dist_from_home'] = distances
else:
home_coords = home_stops[[coord_x, coord_y]].iloc[0].values
dx = group_df[coord_x] - home_coords[0]
dy = group_df[coord_y] - home_coords[1]
distances = np.sqrt(dx*dx + dy*dy)
stops.loc[group_df.index, 'dist_from_home'] = distances.values

# 5) Calculate self-containment per group
    def _group_self_containment(g):
        """Calculate self-containment for a group (time period + user)."""
        # If no activities at all, return NaN
        if len(g) == 0:
            return np.nan

        # No home stop in this period: distances from home are undefined, so
        # containment is too (otherwise NaN distances would silently score 0)
        if (g[traj_cols["location_id"]] == home_activity_type).sum() == 0:
            return np.nan

        # Filter for non-home activities
        non_home = g[g[traj_cols["location_id"]] != home_activity_type]

        if len(non_home) == 0:
            return 1.0  # No non-home activities = perfectly contained at home

# Check which are within threshold
within_threshold = non_home['dist_from_home'] <= threshold

        if weighted:
            # Use the 'time_weights' column (added before any explode) rather
            # than indexing the original Series, whose index may no longer
            # match once stops have been exploded
            if 'time_weights' in non_home.columns:
                weights = non_home[dur_key] * non_home['time_weights']
            else:
                weights = non_home[dur_key]

# Time-weighted proportion
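            # e.g. weights [600, 300] with within_threshold [True, False]
            # gives 600 / 900 = 2/3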
total_weight = weights.sum()
if total_weight == 0:
return np.nan
within_weight = (weights * within_threshold).sum()
return within_weight / total_weight
else:
# Unweighted proportion (count of activities)
return within_threshold.sum() / len(non_home)

result = stops.groupby(keys).apply(_group_self_containment)
return result.reset_index(name='self_containment')
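The returned frame is long-format, one row per (period, user). A quick reshape for inspection (a sketch; assumes the default 'user_id' column name is present):

    sc = self_containment(stops, threshold=1000, agg_freq='D')
    wide = sc.pivot(index='period', columns='user_id', values='self_containment')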