
Commit 4268f5a

Support GCS file pattern in grain
1 parent fca6e8f commit 4268f5a

3 files changed: +24 −8 lines changed


docs/guides/data_input_pipeline/data_input_grain.md

Lines changed: 3 additions & 3 deletions
@@ -41,13 +41,13 @@ MOUNT_PATH=$MOUNT_PATH \
 3. Set `dataset_type=grain`, `grain_file_type={arrayrecord|parquet}`, `grain_train_files` to match the file pattern on the mounted local path.
 4. Tune `grain_worker_count` for performance. This parameter controls the number of child processes used by Grain (more details in [behind_the_scenes](https://google-grain.readthedocs.io/en/latest/behind_the_scenes.html), [grain_pool.py](https://github.com/google/grain/blob/main/grain/_src/python/grain_pool.py)). If you use a large number of workers, check your config for gcsfuse in [setup_gcsfuse.sh](https://github.com/google/maxtext/blob/main/tools/setup/setup_gcsfuse.sh) to avoid gcsfuse throttling.
 
-5. For multi-source blending, you can specify multiple data sources with their respective weights using semicolon (;) as a separator and colon (:) for weights. The weights will be automatically normalized to sum to 1.0. For example:
+5. For multi-source blending, you can specify multiple data sources with their respective weights using semicolon (;) as a separator and a comma (,) for weights. The weights will be automatically normalized to sum to 1.0. For example:
 ```
 # Blend two data sources with 30% from first source and 70% from second source
-grain_train_files=/tmp/gcsfuse/dataset1.array_record*:0.3;/tmp/gcsfuse/dataset2.array_record*:0.7
+grain_train_files=/tmp/gcsfuse/dataset1.array_record*,0.3;/tmp/gcsfuse/dataset2.array_record*,0.7
 
 # Blend three data sources with equal weights (will be normalized to 0.33 each)
-grain_train_files=/tmp/gcsfuse/dataset1.array_record*:1;/tmp/gcsfuse/dataset2.array_record*:1;/tmp/gcsfuse/dataset3.array_record*:1
+grain_train_files=/tmp/gcsfuse/dataset1.array_record*,1;/tmp/gcsfuse/dataset2.array_record*,1;/tmp/gcsfuse/dataset3.array_record*,1
 ```
 Note: When using multiple data sources, only the ArrayRecord format is supported.
 

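The normalization mentioned in step 5 is just division by the sum of the weights. A minimal standalone sketch in Python (variable names are mine; the `spec` string reuses the doc's three-source example, and the split/normalize logic mirrors the `get_datasets` hunk in `_grain_data_processing.py` further down):

```python
# Hypothetical blended spec using the new comma-delimited weight syntax.
spec = "/tmp/gcsfuse/dataset1.array_record*,1;/tmp/gcsfuse/dataset2.array_record*,1;/tmp/gcsfuse/dataset3.array_record*,1"

# Split sources on ';', then each "pattern,weight" pair on ','.
patterns, weights = zip(*(part.split(",") for part in spec.split(";")))
weights = [float(w) for w in weights]
weights = [round(w / sum(weights), 4) for w in weights]

print(patterns)  # the three glob patterns, in order
print(weights)   # [0.3333, 0.3333, 0.3333] -- normalized to sum to ~1.0
```

This is also why a comma had to replace the colon as the weight delimiter: a colon can no longer be reserved for weights once `gs://` patterns, which contain a colon, are valid sources.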
src/MaxText/configs/base.yml

Lines changed: 2 additions & 2 deletions
@@ -569,8 +569,8 @@ hf_eval_files: ''
 hf_access_token: ''
 # for Grain input pipeline (dataset_type=grain)
 # Path to grain data files. Can be a single pattern or multiple patterns with weights.
-# For multiple patterns, use semicolon (;) to separate and colon (:) to specify weights.
-# Example: "path/to/data1.array_record*:0.3;path/to/data2.array_record*:0.7"
+# For multiple patterns, use semicolon (;) to separate and comma (,) to specify weights.
+# Example: "path/to/data1.array_record*,0.3;path/to/data2.array_record*,0.7"
 # Note: When using multiple files (separated by ';'), only ArrayRecord format is supported.
 # For more details, see https://github.com/google/maxtext/blob/main/getting_started/Data_Input_Pipeline.md#grain-input-pipeline
 grain_train_files: ''

src/MaxText/input_pipeline/_grain_data_processing.py

Lines changed: 19 additions & 3 deletions
@@ -17,6 +17,8 @@
 import glob
 from pathlib import Path
 import functools
+from google.cloud import storage
+import re
 
 import ml_collections
 
@@ -32,8 +34,22 @@
 
 
 def find_data_files(data_file_pattern):
-  data_files = glob.glob(str(Path(data_file_pattern).expanduser().resolve()))
-  assert len(data_files) > 0, f"No file found with pattern {data_file_pattern}."
+  """Find data files matching the pattern."""
+  if data_file_pattern.startswith("gs://"):
+    storage_client = storage.Client()
+    match = re.match(r"gs://([a-z0-9._-]+)/(.+)", data_file_pattern)
+    if not match:
+      raise ValueError("Invalid GCS path pattern.")
+    bucket_name, glob_pattern = match.groups()
+    blobs = storage_client.list_blobs(bucket_name, match_glob=glob_pattern)
+    data_files = [f"gs://{bucket_name}/{blob.name}" for blob in blobs]
+  else:
+    # Use glob for local files
+    data_files = glob.glob(str(Path(data_file_pattern).expanduser().resolve()))
+
+  if not data_files:
+    raise FileNotFoundError(f"No files found matching pattern: {data_file_pattern}")
+
   max_logging.log(f"Found {len(data_files)} files for train/eval with grain")
   return data_files
 
@@ -51,7 +67,7 @@ def get_datasets(
   """Load dataset from array_record files for using with grain"""
   if data_file_type == "arrayrecord":
     if ";" in data_file_pattern:
-      data_file_patterns, weights = zip(*[pattern.split(":") for pattern in data_file_pattern.split(";")])
+      data_file_patterns, weights = zip(*[pattern.split(",") for pattern in data_file_pattern.split(";")])
       assert len(data_file_patterns) == len(weights), "Number of data file patterns and weights must match"
       weights = [float(weight) for weight in weights]
       weights = [round(weight / sum(weights), 4) for weight in weights]

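The key piece above is `match_glob`, a server-side filter on `Client.list_blobs` available in recent versions of the `google-cloud-storage` library: the bucket is never mounted and no object data is read; only matching object names come back. A minimal standalone sketch of the same logic (the helper name, bucket, and pattern are hypothetical; the library and valid GCS credentials are assumed):

```python
# Sketch of the GCS listing branch added in this commit, as standalone code.
# Assumes `google-cloud-storage` is installed (match_glob needs a recent
# version) and application-default credentials are available.
import re

from google.cloud import storage


def list_gcs_files(pattern: str) -> list[str]:
  """Expand a gs://bucket/glob pattern into a list of gs:// URIs."""
  match = re.match(r"gs://([a-z0-9._-]+)/(.+)", pattern)
  if not match:
    raise ValueError(f"Invalid GCS path pattern: {pattern}")
  bucket_name, glob_pattern = match.groups()
  # match_glob applies the glob on the server, so only matching object
  # names are returned; object contents are never downloaded.
  blobs = storage.Client().list_blobs(bucket_name, match_glob=glob_pattern)
  return [f"gs://{bucket_name}/{blob.name}" for blob in blobs]


if __name__ == "__main__":
  # Hypothetical bucket and pattern, for illustration only.
  print(list_gcs_files("gs://my-example-bucket/c4/en/*.array_record"))
```

Note that the regex `gs://([a-z0-9._-]+)/(.+)` rejects uppercase bucket names, consistent with GCS bucket naming rules.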