source filename tracking when using open_mfdataset() #8972
-
| Is there a way for xarray to keep track of each source filename when using `open_mfdataset()`?
import pandas as pd
import xarray as xr
from datetime import timedelta
# Zarr stores that are opened together below.
# NOTE(review): the original comment here was cut off ("Each file contains
# multiple variables that differ across") — presumably "across files"; confirm.
files = ['file1.zarr',  'file2.zarr', 'file3.zarr', 'file4.zarr', 'file5.zarr', 'file6.zarr']
ref_data_path = 'data.csv'
# Load the reference table from the CSV file into a DataFrame.
master = pd.read_csv(ref_data_path)
# Open all stores listed in `files` as one multi-file dataset
# (the "galwem" files, per the original comment).
mf = xr.open_mfdataset(files, engine='zarr', parallel=True)
# Time tolerance passed to the nearest-time selection below (15 minutes).
tolerance = timedelta(minutes=15)
def build_timestamp(timeid):
    """Convert a ``YYYYMMDD_HHMMSS[_...]`` identifier into a ``pd.Timestamp``.

    Only the first two underscore-separated fields are used (date, then
    time of day); any further fields are ignored.

    Args:
        timeid: Identifier string whose first field is the date and whose
            second field is the time of day.

    Returns:
        The ``pd.Timestamp`` built from the date and time fields.

    Raises:
        ValueError: If ``timeid`` has fewer than two underscore-separated
            fields.
    """
    time_segs = timeid.split('_')[:2]
    # The slice caps the list at two items, so the only failure mode is
    # having too few. Raise explicitly: an ``assert`` is stripped under -O.
    if len(time_segs) != 2:
        raise ValueError(
            f'Expected "<date>_<time>" at the start of the time id, got {timeid!r}'
        )
    ymd, hms = time_segs
    return pd.Timestamp(
        f'{ymd[:4]}-{ymd[4:6]}-{ymd[6:]} {hms[:2]}:{hms[2:4]}:{hms[4:]}'
    )
# Walk the reference table one row at a time.
for index, row in master.iterrows():
    print(row)
    # Build the reference timestamp from the row's scene_id and read the
    # reference position columns (the position values are not used in the
    # part of the loop shown here).
    ref_time = build_timestamp(row['scene_id'])
    ref_lat = row['lat']
    ref_lon = row['lon']
    ref_alt = row['alt_geom_ft']
    # Use sel() with method='nearest' to pick the time step closest to
    # ref_time; `tolerance` bounds how far away that step may be.
    try:
        slot_array = mf.sel(
            time=ref_time, method='nearest', tolerance=tolerance
            )
    except KeyError as ke:
        # A KeyError is treated as "no time step within tolerance":
        # report it and move on to the next row.
        print(f'No match found for time: {ref_time}')
        print(f'Exception: {ke}')
        continue
    # Find the file that slot_array is associated with.
    # NOTE(review): this assumes the source filename is available as a
    # global attribute named 'filename' on the selection — confirm; nothing
    # shown here sets that attribute.
    filename = slot_array.attrs['filename']
    pass | 
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 1 reply
-
# You'll have to add the file name using a `preprocess` function:
def preprocess(ds, dim="time"):
    """Tag every entry along ``dim`` with the dataset's source file name.

    Intended to be passed as the ``preprocess`` callable when opening
    several files, so each file is tagged before the files are combined.

    Args:
        ds: The dataset opened from a single file.
        dim: Name of the dimension the ``filename`` coordinate is attached
            to; use the right dimension name for your data instead of
            "time" if it differs.

    Returns:
        The same ``ds`` object, with a ``filename`` coordinate added.

    Raises:
        KeyError: If ``ds.encoding`` has no ``"source"`` entry or ``ds`` has
            no dimension named ``dim``.
    """
    # The file name *should* be in `.encoding["source"]`.
    source = ds.encoding["source"]
    # A coordinate declared on `dim` needs one value per entry along it, so
    # repeat the name to the dimension's length rather than passing a scalar.
    ds.coords["filename"] = (dim, [source] * ds.sizes[dim])
    return ds
xr.open_mfdataset(..., preprocess=preprocess) | 
Beta Was this translation helpful? Give feedback.
-
| 
 Yes, by adding it explicitly to the dataset as a new coordinate variable (the easiest way being to use the `preprocess` argument of `open_mfdataset()`).
 Not in the  | 
Beta Was this translation helpful? Give feedback.
You'll have to add the file name using a
`preprocess` function.