
Commit 0a364fe

Merge pull request #2350 from AI-Hypercomputer:aireen/gemma3-multi-image
PiperOrigin-RevId: 808811878
2 parents d4495e1 + 8fdac10 commit 0a364fe

File tree: 10 files changed, +150 -111 lines

src/MaxText/configs/base.yml

Lines changed: 5 additions & 1 deletion

@@ -798,9 +798,13 @@ freeze_vision_encoder_params: True
 dtype_mm: "float32" # Data type for multimodal model's vision encoder
 remat_policy_for_vit: "minimal" # Remat policy for multimodal model's vision encoder. Check `remat_policy` for options.
 image_size_for_vit: 896 # Default for Gemma3, and should be overwritten by model's config
-image_path: "" # Local image path used for decoding
+image_path: "" # Local image path used for decoding; can be multiple paths separated by commas, e.g. "/path/image1.jpg,/path/image2.jpg"
 image_placeholder: "<|image|>"
 posemb_type_for_vit: "learn"
+# max_num_images_per_example only applies to training, when your image column is a list of images.
+# -1 means no limit, and will pad to the max possible number of images determined by the sequence length.
+# Set it to avoid unnecessary padding if you know the maximum number of images per example.
+max_num_images_per_example: -1

 ### llama4 multi modal configs
 hidden_size_for_vit: 1408
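
For context, the budget that max_num_images_per_example bounds follows from the sequence length and the per-image token cost. A minimal sketch of that rule with assumed numbers (the real per-image cost comes from multimodal_utils.get_image_offsets and is model-dependent; 256 here is illustrative):

  # Sketch of the image budget bounded by max_num_images_per_example (numbers assumed).
  max_target_length = 2048         # training sequence length
  image_offsets = 256              # assumed tokens consumed per image; model-dependent
  max_num_images_per_example = -1  # the config default: no explicit limit

  max_num_images = max_target_length // image_offsets - 1  # reserve space for at least one text token
  if max_num_images_per_example > 0:
    max_num_images = min(max_num_images_per_example, max_num_images)
  print(max_num_images)  # 7 with these assumed numbers; every example pads its image list to this count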

src/MaxText/decode.py

Lines changed: 11 additions & 10 deletions

@@ -16,7 +16,7 @@

 import os
 from typing import Sequence
-
+import numpy as np
 import jax
 import jax.numpy as jnp

@@ -101,13 +101,14 @@ def main(argv: Sequence[str]) -> None:
   prefill_length = config.max_prefill_predict_length
   processor_output = multimodal_utils.PreprocessorOutput()
   if config.use_multimodal:
-    text = multimodal_utils.reformat_prompt(
-        text, image_placeholder=config.image_placeholder, model_name=config.model_name
+    image_path = config.image_path.split(",")
+    images = [multimodal_utils.load_image_from_path(p) for p in image_path]
+    processor_outputs = [multimodal_utils.pre_process_image(img, model_name=config.model_name) for img in images]
+    image_offsets = sum(
+        [multimodal_utils.get_image_offsets(config.model_name, processor_output=po) for po in processor_outputs]
     )
-    # TODO(hengtaoguo): Support multiple images as input.
-    images = multimodal_utils.load_image_from_path(config.image_path)
-    processor_output = multimodal_utils.pre_process_image(images, model_name=config.model_name)
-    prefill_length -= multimodal_utils.get_image_offsets(config.model_name, processor_output=processor_output)
+    prefill_length -= image_offsets
+    text = multimodal_utils.reformat_prompt(text, image_placeholder=config.image_placeholder, model_name=config.model_name, num_images=len(images))

   metadata = engine.get_tokenizer()
   tokenizer_model = engine.build_tokenizer(metadata)

@@ -119,9 +120,9 @@ def main(argv: Sequence[str]) -> None:
   tokens, true_length = tokenizer_model.encode(text, is_bos=not has_chat_template, prefill_lengths=[prefill_length])
   if config.use_multimodal:
     tokens = multimodal_utils.prepare_text_for_image_fusion(
-        tokens, model_name=config.model_name, processor_output=processor_output
+        tokens, model_name=config.model_name, processor_output=processor_outputs
     )
-    true_length += multimodal_utils.get_image_offsets(config.model_name, processor_output=processor_output)
+    true_length += image_offsets

   assert (
       true_length <= config.max_prefill_predict_length

@@ -147,7 +148,7 @@ def main(argv: Sequence[str]) -> None:
     prefill_result, first_token = engine.prefill(
         params=params,
         padded_tokens=tokens,
-        images=processor_output.pixel_values,
+        images=np.stack([po.pixel_values for po in processor_outputs]) if config.use_multimodal else None,
         true_length=true_length,
         rng=rng_prefill,
         slot=i,
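
The new decode flow in isolation: split the comma-separated paths, preprocess each image, subtract the combined image token cost from the prefill window, and stack the pixel values for the engine. A minimal sketch assuming a Gemma3 model name and an illustrative prefill length; the paths are placeholders:

  import numpy as np
  from MaxText import multimodal_utils

  image_path = "/path/image1.jpg,/path/image2.jpg"  # comma-separated, per the base.yml comment
  images = [multimodal_utils.load_image_from_path(p) for p in image_path.split(",")]
  processor_outputs = [multimodal_utils.pre_process_image(img, model_name="gemma3-4b") for img in images]

  # Each image consumes a fixed slice of the prefill window, so the text budget shrinks.
  image_offsets = sum(multimodal_utils.get_image_offsets("gemma3-4b", processor_output=po) for po in processor_outputs)
  prefill_length = 1024 - image_offsets  # 1024: assumed max_prefill_predict_length

  # All pixel values are stacked into one (num_images, ...) array for engine.prefill.
  stacked_pixels = np.stack([po.pixel_values for po in processor_outputs])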

src/MaxText/experimental/rl/grpo_input_pipeline.py

Lines changed: 1 addition & 1 deletion

@@ -169,7 +169,7 @@ def lists2array(x):

   operations = [
       grain.MapOperation(lists2array),
-      _input_pipeline_utils.PadOrTrimToMaxLength(max_target_length),
+      _input_pipeline_utils.PadOrTrimToMaxLength(max_target_length, add_true_length=True),
       grain.Batch(batch_size=global_batch_size // jax.process_count(), drop_remainder=drop_remainder),
   ]
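
The merged transform defaults add_true_length=False, so the GRPO pipeline opts back in to keep its *_true_length columns. A toy example of what that flag produces, with made-up values:

  import numpy as np
  from MaxText.input_pipeline import _input_pipeline_utils

  op = _input_pipeline_utils.PadOrTrimToMaxLength(max_length=8, pad_id=0, add_true_length=True)
  element = {"inputs": np.array([5, 6, 7, 8, 9])}
  out = op.map(element)
  # "inputs" is padded to length 8; the extra column records the unpadded length.
  print(out["inputs_true_length"])  # [5]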

src/MaxText/input_pipeline/_grain_data_processing.py

Lines changed: 2 additions & 2 deletions

@@ -133,7 +133,7 @@ def pretrain_preprocessing_pipeline(dataset, config, data_columns, tokenize, gra
     }
     dataset = dataset.map(_input_pipeline_utils.Rekey(rekey_dict))
   else:
-    dataset = dataset.map(_input_pipeline_utils.PadToMaxLength(config.max_target_length, pad_id))
+    dataset = dataset.map(_input_pipeline_utils.PadOrTrimToMaxLength(config.max_target_length, pad_id))
     batch_fn = functools.partial(grain.experimental.batch_and_pad, batch_size=batch_size, pad_value=pad_id)
     dataset = dataset.batch(batch_size, batch_fn=batch_fn)

@@ -175,7 +175,7 @@ def dpo_preprocessing_pipeline(dataset, config, data_columns, tokenize, grain_wo
     )
   )

-  dataset = dataset.map(_input_pipeline_utils.PadToMaxLength(config.max_target_length, pad_id))
+  dataset = dataset.map(_input_pipeline_utils.PadOrTrimToMaxLength(config.max_target_length, pad_id))
   batch_size = config.global_batch_size_to_load // jax.process_count()
   batch_fn = functools.partial(grain.experimental.batch_and_pad, batch_size=batch_size, pad_value=pad_id)
   dataset = dataset.batch(batch_size, batch_fn=batch_fn)
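
Functionally, the swap matters for over-long examples: PadToMaxLength only padded, while the merged transform's _pad_text also slices to max_length, so sequences longer than the target are now trimmed. A numpy-only sketch of that rule:

  import numpy as np

  def pad_or_trim(x, max_length, pad_id=0):
    # Mirrors _pad_text: pad up to max_length, then slice back down to it.
    pad_amount = max(max_length - x.shape[0], 0)
    pad_widths = [(0, pad_amount)] + [(0, 0)] * (x.ndim - 1)
    return np.pad(x, pad_widths, constant_values=pad_id)[:max_length]

  print(pad_or_trim(np.array([1, 2, 3]), 5))           # [1 2 3 0 0]
  print(pad_or_trim(np.array([1, 2, 3, 4, 5, 6]), 5))  # [1 2 3 4 5]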

src/MaxText/input_pipeline/_hf_data_processing.py

Lines changed: 16 additions & 7 deletions

@@ -43,11 +43,14 @@ def vision_sft_preprocessing_pipeline(
   """pipeline for multimodal SFT with HF dataset"""

   assert len(text_columns) == 2, f"Need two text_columns for query and response, received {text_columns=}"
-
+  batch_size = global_batch_size // jax.process_count()
   if config.enable_data_shuffling:
     dataset = dataset.shuffle(seed=config.data_shuffle_seed)

   dataset = dataset.select_columns(text_columns + [image_column])
+  if image_column != "images":
+    dataset = dataset.rename_column(image_column, "images")
+
   dataset = dataset.map(
       _input_pipeline_utils.reformat_prompt,
       fn_kwargs={

@@ -60,8 +63,6 @@ def vision_sft_preprocessing_pipeline(
       _input_pipeline_utils.reformat_response,
       fn_kwargs={"column": text_columns[1], "model_name": config.model_name},
   )
-  if image_column != "images":
-    dataset = dataset.rename_column(image_column, "images")

   dataset = dataset.map(
       _input_pipeline_utils.pre_process_image_sft,

@@ -85,6 +86,7 @@ def vision_sft_preprocessing_pipeline(
   dataset = dataset.map(
       _input_pipeline_utils.tokenization,
       batched=True,
+      batch_size=global_batch_size,
       fn_kwargs={
           "hf_tokenizer": tokenizer,
           "truncation": False,

@@ -115,8 +117,15 @@ def vision_sft_preprocessing_pipeline(
       )
   )
   # TODO(aireenmei, hengtaoguo): support packing
-  operations.append(_input_pipeline_utils.PadToMaxLength(config.max_target_length, pad_id))
-  operations.append(grain.Batch(batch_size=global_batch_size // jax.process_count(), drop_remainder=True))
+  operations.append(
+      _input_pipeline_utils.PadOrTrimToMaxLength(
+          config.max_target_length,
+          pad_id,
+          model_name=config.model_name,
+          max_num_images_per_example=config.max_num_images_per_example,
+      )
+  )
+  operations.append(grain.Batch(batch_size=batch_size, drop_remainder=True))
   operations.append(_input_pipeline_utils.ShiftData(ignored_ids=[pad_id], axis=1))
   dummy_index_sampler = grain.IndexSampler(
       num_records=len(dataset),

@@ -134,7 +143,7 @@ def vision_sft_preprocessing_pipeline(
       sampler=dummy_index_sampler,
       worker_count=1,  # only supports <=1 for now, more workers results in duplicated data
       worker_buffer_size=1,
-      read_options=grain.ReadOptions(num_threads=1, prefetch_buffer_size=128),
+      read_options=grain.ReadOptions(num_threads=1, prefetch_buffer_size=batch_size * 4),
   )

   multihost_gen = multihost_dataloading.MultiHostDataLoadIterator(dataloader, global_mesh)

@@ -274,7 +283,7 @@ def lists2array(x):
     )
     operations.append(_input_pipeline_utils.ReformatPacking(data_column_names))
   else:
-    operations.append(_input_pipeline_utils.PadToMaxLength(max_target_length, pad_id))
+    operations.append(_input_pipeline_utils.PadOrTrimToMaxLength(max_target_length, pad_id))
   operations.append(grain.Batch(batch_size=global_batch_size // jax.process_count(), drop_remainder=drop_remainder))

   if shift and not use_dpo:
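
Note the ordering dependency behind the moved rename_column: reformat_prompt (see _input_pipeline_utils below) now counts images by reading example["images"], so the column must carry that name before the prompt is reformatted. A toy illustration of the count it computes:

  import numpy as np

  example = {"prompt": "describe <|image|> and <|image|>", "images": [np.zeros((8, 8, 3))] * 2}
  num_images = len(example["images"]) if isinstance(example["images"], list) else 1
  print(num_images)  # 2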

src/MaxText/input_pipeline/_input_pipeline_utils.py

Lines changed: 62 additions & 47 deletions

@@ -68,7 +68,11 @@ def add_segmentation_and_position(x, data_columns, padding_token=0):

 def reformat_prompt(example, column, image_placeholder, model_name):
   """reformat prompt for multimodal SFT"""
-  example[column] = multimodal_utils.reformat_prompt(example[column], image_placeholder, model_name)
+  if isinstance(example["images"], list):
+    num_images = len(example["images"])
+  else:
+    num_images = 1
+  example[column] = multimodal_utils.reformat_prompt(example[column], image_placeholder, model_name, num_images)
   return example


@@ -80,11 +84,19 @@ def reformat_response(example, column, model_name):

 def pre_process_image_sft(example, image_column, model_name):
   """pre-process image for multimodal SFT"""
-  image = multimodal_utils.convert_to_RGB(example[image_column])
-  # TODO(aireenmei, hengtaoguo): add support for different image sizes
-  image = multimodal_utils.resize_image(image, model_name)
-  image = np.array(image)
-  example[image_column] = multimodal_utils.pre_process_image(image, model_name)
+
+  def _process_image_fn(image):
+    image = multimodal_utils.convert_to_RGB(image)
+    # TODO(aireenmei, hengtaoguo): add support for different image sizes
+    image = multimodal_utils.resize_image(image, model_name)
+    image = np.array(image)
+    image = multimodal_utils.pre_process_image(image, model_name)
+    return image
+
+  if isinstance(example[image_column], list):
+    example[image_column] = [_process_image_fn(img) for img in example[image_column]]
+  else:
+    example[image_column] = _process_image_fn(example[image_column])
   return example


@@ -93,7 +105,10 @@ def prepare_text_for_image_fusion(example, column_name, model_name):
   example[column_name] = multimodal_utils.prepare_text_for_image_fusion(
       example[column_name], model_name, processor_output=example["images"]
   )
-  example["images"] = example["images"].pixel_values
+  if isinstance(example["images"], list):
+    example["images"] = [image.pixel_values for image in example["images"]]
+  else:
+    example["images"] = example["images"].pixel_values
   return example


@@ -400,58 +415,58 @@ def map(self, element):

 @dataclasses.dataclass
 class PadOrTrimToMaxLength(grain.MapTransform):
-  """Pads/Trims each input to the specified length
-  and returns true_length of input
-  """
-
-  def __init__(self, max_length):
-    self.max_length = max_length
-
-  def map(self, element: dict[str, np.ndarray]):
-    """map to each element"""
-
-    def _pad(x, max_length):
-      pad_amount = max(max_length - x.shape[0], 0)
-      pad_amount = [(0, pad_amount)] + [(0, 0)] * (len(x.shape) - 1)
-      return np.pad(x, pad_amount)[:max_length]
-
-    data_columns = list(element.keys())
-    for data_column in data_columns:
-      element[f"{data_column}_segmentation"] = (element[data_column] != 0).astype(np.int32)
-      element[f"{data_column}_position"] = np.arange(element[data_column].shape[0], dtype=np.int32)
-      element[f"{data_column}_true_length"] = np.array([element[data_column].shape[0]], dtype=np.int32)
-    for key, _ in element.items():
-      if "true_length" not in key:
-        element[key] = _pad(element[key], self.max_length)
-    # for data_column in data_columns:
-    #   data[f"{data_column}_true_length"] = _max_true_length(data[data_column], 0)
-    return element
-
+  """Pads or trims each input to the specified length,
+  and optionally adds the true length of the input."""

-@dataclasses.dataclass
-class PadToMaxLength(grain.MapTransform):
-  """Pads each input to the specified length"""
-
-  def __init__(self, max_length, pad_id):
+  def __init__(self, max_length, pad_id=0, model_name=None, add_true_length=False, max_num_images_per_example=-1):
     self.max_length = max_length
     self.pad_id = pad_id
+    self.model_name = model_name
+    self.add_true_length = add_true_length
+    self.max_num_images_per_example = max_num_images_per_example
+
+  def _pad_text(self, x, max_length, pad_id):
+    pad_amount = max(max_length - x.shape[0], 0)
+    pad_amount = [(0, pad_amount)] + [(0, 0)] * (len(x.shape) - 1)
+    return np.pad(x, pad_amount, constant_values=pad_id)[: self.max_length]
+
+  def _pad_image(self, images):
+    image_offsets = multimodal_utils.get_image_offsets(self.model_name, None)
+    max_num_images = (self.max_length // image_offsets) - 1  # -1 to reserve space for at least one text token
+    if self.max_num_images_per_example > 0:
+      max_num_images = min(self.max_num_images_per_example, max_num_images)
+    image_shape = multimodal_utils.get_dummy_image_shape_for_init(self.model_name)[2:]
+    assert (
+        images.shape[0] <= max_num_images
+    ), f"Number of images {images.shape[0]} exceeds the maximum allowed {max_num_images}"
+    if images.shape[0] < max_num_images:
+      pad_size = max_num_images - images.shape[0]
+      pad_shape = (pad_size,) + image_shape
+      pad_images = np.zeros(pad_shape, dtype=images.dtype)
+      if images is not None and images.size > 0:
+        images = np.concatenate([images, pad_images], axis=0)
+      else:
+        images = pad_images
+    return images

   def map(self, element: dict[str, np.ndarray]):
     """map to each element"""
-
-    def _pad(x, max_length, pad_id):
-      pad_amount = max(max_length - x.shape[0], 0)
-      pad_amount = [(0, pad_amount)] + [(0, 0)] * (len(x.shape) - 1)
-      return np.pad(x, pad_amount, constant_values=pad_id)
-
     data_columns = list(element.keys())
     for data_column in data_columns:
       if data_column != "images":
         element[f"{data_column}_segmentation"] = (element[data_column] != self.pad_id).astype(np.int32)
         element[f"{data_column}_position"] = np.arange(element[data_column].shape[0], dtype=np.int32)
+        if self.add_true_length:
+          element[f"{data_column}_true_length"] = np.array([element[data_column].shape[0]], dtype=np.int32)
     for key, _ in element.items():
-      if key != "images":
-        element[key] = _pad(element[key], self.max_length, self.pad_id)
+      if key == "images":
+        if isinstance(element["images"], list):
+          assert self.model_name is not None, "model_name must be provided when padding images"
+          element["images"] = self._pad_image(np.asarray(element["images"]))
+        else:
+          element["images"] = np.asarray(element["images"])[None, ...]
+      elif "true_length" not in key:
+        element[key] = self._pad_text(element[key], self.max_length, self.pad_id)
     return element
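
To see what _pad_image does to a batch element, here is a numpy-only sketch with assumed numbers (256 tokens per image, a 2048-token target length, and a toy image shape):

  import numpy as np

  max_length, image_offsets = 2048, 256
  max_num_images = max_length // image_offsets - 1  # 7; one slot reserved for a text token
  images = np.zeros((2, 4, 4, 3), dtype=np.float32)  # an example holding 2 toy images

  assert images.shape[0] <= max_num_images
  pad = np.zeros((max_num_images - images.shape[0],) + images.shape[1:], dtype=images.dtype)
  padded = np.concatenate([images, pad], axis=0)
  print(padded.shape)  # (7, 4, 4, 3): every example now carries the same image count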

src/MaxText/maxengine.py

Lines changed: 9 additions & 6 deletions

@@ -46,6 +46,7 @@
 from MaxText import inference_utils
 from MaxText import max_utils
 from MaxText import maxtext_utils
+from MaxText import multimodal_utils
 from MaxText import pyconfig
 from MaxText.common_types import MODEL_MODE_PREFILL, DECODING_ACTIVE_SEQUENCE_INDICATOR, MODEL_MODE_AUTOREGRESSIVE
 from MaxText.globals import MAXTEXT_PKG_DIR

@@ -331,7 +332,7 @@ def model_apply(_p, _rng):
         jnp.ones((1, self.config.max_prefill_predict_length), dtype=jnp.int32),
         jnp.ones((1, self.config.max_prefill_predict_length), dtype=jnp.int32),
         encoder_images=jnp.ones(
-            maxtext_utils.get_dummy_image_shape_for_init(self.config),
+            multimodal_utils.get_dummy_image_shape_for_init(self.config.model_name, batch_size=self.config.micro_batch_size_to_train_on),
             dtype=jnp.float32,
         )
         if self.config.use_multimodal

@@ -474,10 +475,12 @@ def _prefill_jit(

     input_images = None
     if self.config.use_multimodal and images is not None:
-      if self.config.model_name.startswith("gemma3"):
-        input_images = images[jnp.newaxis, jnp.newaxis, ...]  # Add batch and sequence dimension [B, N, H, W, C]
-      elif self.config.model_name.startswith("llama4"):
-        input_images = images[jnp.newaxis, ...]  # Add batch dimension [B, T, C, H, W]
+      if images.ndim == 3:
+        # For Gemma3 single image, add batch and image count dimensions
+        input_images = images[jnp.newaxis, jnp.newaxis, ...]
+      elif images.ndim == 4:
+        # add batch dimension
+        input_images = images[jnp.newaxis, ...]

     # sequence_indicator will be concatenated to existing_prefix decoder_segment_ids
     start_to_n = jnp.arange(start_position, start_position + input_tokens.shape[1])

@@ -1524,7 +1527,7 @@ def init(abstract_params, page_state):
         (int(self.config.per_device_batch_size * self.mesh.size), 1),
         dtype=jnp.int32,
     )
-    dummy_image = jnp.ones(maxtext_utils.get_dummy_image_shape_for_init(self.config), dtype=jnp.int32)
+    dummy_image = jnp.ones(multimodal_utils.get_dummy_image_shape_for_init(self.config.model_name, batch_size=self.config.micro_batch_size_to_train_on), dtype=jnp.int32)
     _, cache = self.model.apply(
         abstract_params,
         x,
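
The ndim dispatch above replaces the model-name check: a single preprocessed image arrives rank-3, while a stack of images (or llama4's tiled input) arrives rank-4. A sketch of the resulting shapes, with Gemma3's 896x896 default assumed:

  import jax.numpy as jnp

  single = jnp.ones((896, 896, 3))      # one Gemma3 image: (H, W, C), ndim == 3
  stacked = jnp.ones((2, 896, 896, 3))  # multiple images: (N, H, W, C), ndim == 4

  print(single[jnp.newaxis, jnp.newaxis, ...].shape)  # (1, 1, 896, 896, 3): batch + image-count axes
  print(stacked[jnp.newaxis, ...].shape)              # (1, 2, 896, 896, 3): batch axis only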
