Skip to content

Commit 874cb2a

Browse files
committed
debug logs
1 parent 6c66033 commit 874cb2a

File tree

2 files changed

+13
-0
lines changed

2 files changed

+13
-0
lines changed

fast_llm/data/dataset/gpt/sampled.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import logging
33
import math
44
import pathlib
5+
import time
56
import typing
67
import warnings
78

@@ -420,6 +421,7 @@ def __getitem__(self, index: int) -> typing.Any:
420421
The returned sample is ready to be concatenated, then fed to a `GPTModel` (see `GPTModel.preprocess`).
421422
"""
422423
self._lazy_load()
424+
start_time = time.perf_counter()
423425

424426
if self._parameters.use_preference_loss_spans:
425427
if index < self._unshuffled_documents:
@@ -649,6 +651,13 @@ def __getitem__(self, index: int) -> typing.Any:
649651
image_positions = np.array(image_positions) if image_positions else None
650652
Assert.eq(len(token_ids), self._parameters.sequence_length + self._parameters.extra_tokens)
651653

654+
data_time = (time.perf_counter() - start_time) * 1000
655+
if data_time > 100:
656+
logger.warning(
657+
f"Data loading took {data_time:,.2f} ms for {image_tokens_added} image tokens and "
658+
f"{text_tokens_added} text tokens. {len(images) if images else 0} images and {len(token_ids)} total tokens."
659+
)
660+
652661
return GPTSample(
653662
token_ids=token_ids,
654663
loss_masking_spans=loss_masking_spans,

fast_llm/models/gpt/model.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import time
23
import typing
34

45
import torch
@@ -332,11 +333,14 @@ def preprocess(
332333
batch, reference_preprocessed_meta, phase=PhaseType.inference, iteration=iteration
333334
)
334335

336+
start_time = time.perf_counter()
335337
# TODO: Do things work with >1?
336338
Assert.eq(len(reference_batch), len(preprocessed_meta), 1)
337339
for i, (reference_tokens, reference_kwargs) in enumerate(reference_batch):
338340
reference_model.forward(reference_tokens, reference_kwargs, iteration=iteration)
339341
reference_logits[i][f"{name}_logits"] = reference_kwargs["logits"]
342+
elapsed_time = (time.perf_counter() - start_time) * 1000
343+
logger.info(f"Ref model {name} took {elapsed_time:.2f} ms for {len(reference_batch)} sequences.")
340344

341345
token_ids = batch.token_ids
342346
if sequence_first:

0 commit comments

Comments (0)