170 changes: 167 additions & 3 deletions nomad/metrics/metrics.py
@@ -149,7 +149,7 @@ def _group_centroid(g):
w = None
return _centroid(pts, metric=metric, weight=w)

-    cent = stops.groupby(keys).apply(_group_centroid, include_groups=False)
+    cent = stops.groupby(keys).apply(_group_centroid)

cent_df = pd.DataFrame(
cent.tolist(),
@@ -185,5 +185,169 @@ def _group_rog(g):
else:
return np.sqrt(g['d2'].mean())

-    rog = stops.groupby(keys).apply(_group_rog, include_groups=False)
-    return rog.reset_index(name='rog')
+    rog = stops.groupby(keys).apply(_group_rog)
+    return rog.reset_index(name='rog')
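Note on the two changes above: GroupBy.apply's include_groups keyword only exists in pandas >= 2.2, so dropping it keeps these calls working on older pandas, at the cost of a DeprecationWarning on newer versions (where including the grouping columns in apply is deprecated). A minimal version-agnostic sketch; the helper name is hypothetical and assumes the applied function never reads the grouping columns:

    import inspect

    def _apply_excluding_groups(grouped, func):
        # Pass include_groups=False where supported (pandas >= 2.2); older
        # pandas would forward the unknown keyword to func, so fall back.
        if "include_groups" in inspect.signature(grouped.apply).parameters:
            return grouped.apply(func, include_groups=False)
        return grouped.apply(func)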

def self_containment(stops, threshold, agg_freq='d', weighted=True, home_activity_type='home',
traj_cols=None, time_weights=None,
exploded=True, **kwargs):
"""
Compute self-containment (proportion of non-home time spent within threshold distance from home).

Self-containment describes the propensity of individuals to stay close to home. It is calculated
as the time-weighted proportion of non-home activities that are within a threshold distance from home.

Parameters
----------
stops : pd.DataFrame
Stop data with spatial coordinates, duration, and location_id.
    threshold : float
        Distance threshold. In coordinate units for projected data (typically
        meters); for lat/lon input, in the units returned by
        utils._haversine_distance (typically meters, not degrees).
        Activities within this distance from home are considered "contained".
    agg_freq : str
        Pandas offset alias for time-bucketing. Only daily and weekly
        frequencies are supported ('d'/'D' or 'w'/'W').
weighted : bool
If True, weight by duration; else unweighted (count activities).
    home_activity_type : str
        Value in the location_id column that marks home stops. Default is
        'home'; any other location_id value can be used.
traj_cols : dict, optional
Mapping for x/y (or lon/lat), timestamp/datetime, duration, user_id, location_id.
time_weights : pd.Series, optional
Additional time weights to multiply with duration (if weighted=True).
exploded : bool
If True, explode stops that straddle multiple time periods. Default is True.
**kwargs
Additional arguments passed to explode_stops or column overrides.

Returns
-------
pd.DataFrame
Columns = [period, user_id?, self_containment].
self_containment is the proportion [0, 1] of non-home time spent within threshold from home.
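
    Examples
    --------
    A minimal sketch with projected coordinates; the column names below are
    assumed defaults, adapt them to your schema (or pass traj_cols):

    >>> import pandas as pd
    >>> stops = pd.DataFrame({
    ...     'user_id': ['u1', 'u1', 'u1'],
    ...     'x': [0.0, 500.0, 3000.0],
    ...     'y': [0.0, 0.0, 0.0],
    ...     'timestamp': [1704067200, 1704070800, 1704074400],
    ...     'duration': [600, 300, 300],
    ...     'location_id': ['home', 'shop', 'work'],
    ... })
    >>> self_containment(stops, threshold=1000, agg_freq='D', exploded=False)

    With duration weighting, 'shop' (300 s at 500 m) is within the 1 km
    threshold and 'work' (300 s at 3000 m) is not, so the single daily value
    is 300 / 600 = 0.5.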
"""
stops = stops.copy()

    # Restrict agg_freq to days and weeks, and normalize case (lowercase
    # period aliases are deprecated in recent pandas)
    allowed_freqs = ['d', 'w', 'D', 'W']
    if agg_freq not in allowed_freqs:
        raise ValueError(f"agg_freq must be one of {allowed_freqs} (got '{agg_freq}')")
    agg_freq = agg_freq.upper()

# Parse column mappings (similar to compute_candidate_homes)
traj_cols = loader._parse_traj_cols(stops.columns, traj_cols, kwargs)

# Check for required location_id column
if traj_cols["location_id"] not in stops.columns:
raise ValueError(f"Missing required '{traj_cols['location_id']}' column")

# Warn if no home locations exist in the entire dataset
if (stops[traj_cols["location_id"]] == home_activity_type).sum() == 0:
warnings.warn(
f"No home locations found (location_id == '{home_activity_type}'). "
f"Self-containment cannot be calculated without home locations.",
UserWarning
)

# Add time_weights column if provided
if time_weights is not None:
if isinstance(time_weights, pd.Series) and (len(time_weights) == len(stops)):
stops['time_weights'] = time_weights
else:
raise ValueError("time_weights must be a pd.Series with the same length and index as stops.")

    if exploded:
        n_before = len(stops)
        stops = utils.explode_stops(stops, agg_freq=agg_freq, **kwargs)
        if len(stops) > n_before:
            warnings.warn(
                f"Some stops straddle multiple {agg_freq}s. They have been exploded into separate rows.",
                UserWarning
            )
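        # e.g. with agg_freq='D', a stop running 23:30-00:30 becomes two rows
        # (23:30-00:00 and 00:00-00:30), assuming explode_stops splits
        # durations at period boundaries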

# 1) Column mapping + check
t_key, coord_x, coord_y, use_datetime, use_lon_lat = utils._fallback_st_cols(stops.columns, traj_cols, kwargs)
dur_key = traj_cols['duration']
if dur_key not in stops.columns:
raise ValueError("Missing required 'duration' column")

# 2) Time buckets
if use_datetime:
temp_dt = stops[traj_cols[t_key]]
else:
temp_dt = pd.to_datetime(stops[traj_cols[t_key]], unit='s')
if agg_freq == "W":
agg_freq = "W-MON"

stops['period'] = temp_dt.dt.to_period(agg_freq).dt.start_time
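    # e.g. Timestamp('2024-01-03 14:00') buckets to 2024-01-03 00:00 with 'D',
    # and to Monday 2024-01-01 00:00 with 'W-SUN'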

# 3) Grouping keys
keys = ['period']
uid_key = traj_cols['user_id']
if uid_key in stops.columns:
keys.append(uid_key)

# 4) Calculate distance from home for each group
metric = 'haversine' if use_lon_lat else 'euclidean'

# Initialize distance column
stops['dist_from_home'] = np.nan

# Calculate for each group
for group_keys_tuple, group_df in stops.groupby(keys):
home_stops = group_df[group_df[traj_cols["location_id"]] == home_activity_type]

if len(home_stops) == 0:
# No home location in this group
continue

        # Use the first home location as reference
        if metric == 'haversine':
            home_coords = home_stops[[traj_cols['latitude'], traj_cols['longitude']]].iloc[0].values
            home_coords_rad = np.radians(home_coords)

            # Per-point great-circle distance; _haversine_distance is assumed
            # to return distance in the same units as `threshold` (typically
            # meters). Assigning once per group avoids per-cell .loc writes.
            coords_rad = np.radians(group_df[[traj_cols['latitude'], traj_cols['longitude']]].to_numpy())
            distances = np.array([utils._haversine_distance(pt, home_coords_rad) for pt in coords_rad])
            stops.loc[group_df.index, 'dist_from_home'] = distances
else:
home_coords = home_stops[[coord_x, coord_y]].iloc[0].values
dx = group_df[coord_x] - home_coords[0]
dy = group_df[coord_y] - home_coords[1]
distances = np.sqrt(dx*dx + dy*dy)
stops.loc[group_df.index, 'dist_from_home'] = distances.values

# 5) Calculate self-containment per group
    def _group_self_containment(g):
        """Calculate self-containment for a group (time period + user)."""
        # If no activities at all, return NaN
        if len(g) == 0:
            return np.nan

        # No home stop in this period: distances from home are undefined, so
        # containment is too (otherwise NaN distances would silently score 0)
        if (g[traj_cols["location_id"]] == home_activity_type).sum() == 0:
            return np.nan

        # Filter for non-home activities
        non_home = g[g[traj_cols["location_id"]] != home_activity_type]

        if len(non_home) == 0:
            return 1.0  # No non-home activities = perfectly contained at home

# Check which are within threshold
within_threshold = non_home['dist_from_home'] <= threshold

        if weighted:
            # Use the 'time_weights' column (added before any explode) rather
            # than indexing the original Series, whose index may no longer
            # match once stops have been exploded
            if 'time_weights' in non_home.columns:
                weights = non_home[dur_key] * non_home['time_weights']
            else:
                weights = non_home[dur_key]

# Time-weighted proportion
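            # e.g. weights [600, 300] with within_threshold [True, False]
            # gives 600 / 900 = 2/3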
total_weight = weights.sum()
if total_weight == 0:
return np.nan
within_weight = (weights * within_threshold).sum()
return within_weight / total_weight
else:
# Unweighted proportion (count of activities)
return within_threshold.sum() / len(non_home)

result = stops.groupby(keys).apply(_group_self_containment)
return result.reset_index(name='self_containment')
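The returned frame is long-format, one row per (period, user). A quick reshape for inspection (a sketch; assumes the default 'user_id' column name is present):

    sc = self_containment(stops, threshold=1000, agg_freq='D')
    wide = sc.pivot(index='period', columns='user_id', values='self_containment')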