Draft
Changes from 84 commits
Commits (88 total)
1d81455
Merge pull request #847 from mlcommons/dev
priyakasimbeg Feb 27, 2025
da5f85a
first LM commit
Niccolo-Ajroldi Mar 11, 2025
a12a364
lm data pipeline
Niccolo-Ajroldi Mar 12, 2025
ca83ab8
testing
Niccolo-Ajroldi Mar 14, 2025
e3e78dc
LM workload tested torch pipeline
Niccolo-Ajroldi Mar 17, 2025
e619495
LM workload - fix torch tests
Niccolo-Ajroldi Mar 17, 2025
d8e9c56
add LM tests, remove dev files
Niccolo-Ajroldi Mar 18, 2025
6b4ff12
add LM tests, remove dev files
Niccolo-Ajroldi Mar 18, 2025
3c5c847
Stop tracking .gitignore
Niccolo-Ajroldi Mar 18, 2025
20d841b
Remove dev/ from repo, keep locally
Niccolo-Ajroldi Mar 18, 2025
f3ba059
fix comments
Niccolo-Ajroldi Mar 18, 2025
381451f
add class specifications
Niccolo-Ajroldi Mar 18, 2025
f111d2e
add workload LM info
Niccolo-Ajroldi Mar 18, 2025
808d398
restore data_utils.py tree map
Niccolo-Ajroldi Mar 18, 2025
35f8f89
fixed NFS bug
Niccolo-Ajroldi Mar 18, 2025
cbb6ee6
train/val split before concat
Niccolo-Ajroldi Mar 18, 2025
868987c
renamed datasets to avoid conflict with HF
Niccolo-Ajroldi Mar 19, 2025
8191f6d
Merge remote-tracking branch 'upstream/lm_workload' into lm_workload
Niccolo-Ajroldi Mar 19, 2025
dd59ded
renamed datasets to dataset
Niccolo-Ajroldi Mar 19, 2025
496b9c3
fix style
Niccolo-Ajroldi Mar 20, 2025
50989eb
fix formatting
Niccolo-Ajroldi Mar 20, 2025
5af0fdc
fix style
Niccolo-Ajroldi Mar 20, 2025
2683099
fix style
Niccolo-Ajroldi Mar 20, 2025
6b7ee29
fix yapf
Niccolo-Ajroldi Mar 20, 2025
46b645b
fix style
Niccolo-Ajroldi Mar 20, 2025
b3ae647
HF datasets pipeline
rka97 Mar 27, 2025
f095d4b
Testing with linear model
rka97 Mar 27, 2025
4189ae0
Merge branch 'jit_switch' into lm_workload
rka97 Mar 27, 2025
0c22f3d
lm workload with linear model
rka97 Apr 3, 2025
99c7b9b
add nanodo model
rka97 Apr 3, 2025
706d9f7
torch model
rka97 Apr 3, 2025
c335e34
lm workload dataset integration in jax
rka97 May 29, 2025
2d54365
lm workload dataset integration in jax
rka97 May 29, 2025
af8cce4
set package versions for transformers and datasets
priyakasimbeg Jun 5, 2025
d68c54e
use train_test_split method to shuffle and split fineweb-edu dataset
priyakasimbeg Jun 5, 2025
9737367
modifications to fwedu datasetup
priyakasimbeg Jun 9, 2025
1bf0750
rename fwedu data dir
priyakasimbeg Jun 9, 2025
a333391
fix
priyakasimbeg Jun 9, 2025
05dc4dd
add back batch mapping in tokenization for fwedu
priyakasimbeg Jun 9, 2025
b374cf8
debugging
priyakasimbeg Jun 10, 2025
c0c1e3c
debugging
priyakasimbeg Jun 10, 2025
f76dc39
debugging
priyakasimbeg Jun 10, 2025
e805fa7
use tfds to shuffle and split dataset
priyakasimbeg Jun 10, 2025
362cbda
Merge remote-tracking branch 'origin/dev' into lm_workload
rka97 Sep 11, 2025
c9e9abc
add command for fineweb-edu
priyakasimbeg Oct 2, 2025
e4323de
fix
priyakasimbeg Oct 2, 2025
f0c6e75
update calls to sharing utils
priyakasimbeg Oct 3, 2025
f4ffbe7
Fix torch sharding issue, update input pipeline and workload classes …
rka97 Oct 6, 2025
5c85c7e
test working, lm workload training not working (debugging)
rka97 Oct 6, 2025
a59dfda
updates to input_pipeline and model spec
priyakasimbeg Oct 6, 2025
1c3cb66
add defaults for lm workload
priyakasimbeg Oct 6, 2025
af91b12
refactor eval pipeline and loss fn for lm
priyakasimbeg Oct 7, 2025
6b55adf
refactor evaluation pipeline for lm
priyakasimbeg Oct 7, 2025
210d671
remove temporary flag for hlo dumps
priyakasimbeg Oct 7, 2025
0ad7788
fix in workload target condition check
priyakasimbeg Oct 7, 2025
01921d5
fix in mlp for glu
priyakasimbeg Oct 8, 2025
e420450
Fix OOM error in weighted cross entropy calculation
rka97 Oct 10, 2025
3b31ad5
fix issue with checkpointing bool
rka97 Oct 10, 2025
bbc114f
increase buffer size
priyakasimbeg Oct 10, 2025
f531b35
Merge branch 'lm_workload_priya' of github.com:mlcommons/algorithmic-…
priyakasimbeg Oct 10, 2025
2b162e8
remove _eval_batch from jax workload
priyakasimbeg Oct 10, 2025
617e1a3
add todo for pytorch _eval_batch cleanup
priyakasimbeg Oct 10, 2025
bebc80a
Merge pull request #891 from mlcommons/lm_workload_priya
rka97 Oct 15, 2025
64ea658
add target setting algorithm for fineweb edu lm workload
priyakasimbeg Oct 16, 2025
b38ade0
update step hint for lm workload
priyakasimbeg Oct 16, 2025
65369f2
update target
priyakasimbeg Oct 16, 2025
6171b2d
update eval split sizes for lm workload and target setting point
priyakasimbeg Oct 16, 2025
d7a885c
Porting workload input pipeline to torch
rka97 Oct 17, 2025
f111aea
Merge branch 'lm_workload' of github.com:mlcommons/algorithmic-effici…
rka97 Oct 17, 2025
1f0439a
Fix OOM bug in lm eval
rka97 Oct 18, 2025
b11c193
repeat dataset
rka97 Oct 18, 2025
42d1d1a
label smoothing default fix
priyakasimbeg Oct 20, 2025
c334c97
finish merge
priyakasimbeg Oct 20, 2025
d95f2bf
Make sure to take the correct number of batches in lm
rka97 Oct 21, 2025
7deb070
Merge branch 'lm_workload' of github.com:mlcommons/algorithmic-effici…
rka97 Oct 21, 2025
0dc16db
Properly handle repetition in LM training and evaluation splits
rka97 Oct 21, 2025
7edb702
move eval_batch from shared class to framework specific classes since…
priyakasimbeg Oct 21, 2025
0879e68
finish merge
priyakasimbeg Oct 21, 2025
73e3ea6
Refactor imports and clean up unused code in LM workload and related …
rka97 Oct 21, 2025
91988af
pass linter checks
rka97 Oct 21, 2025
bb4a380
Refactor loss function in LM workloads to unify label handling and im…
rka97 Oct 21, 2025
a58fbd5
Fix init in both models to be the same, add lm model diff test
rka97 Oct 21, 2025
b59afa0
Refactor model configuration classes to make them consistent between …
rka97 Oct 21, 2025
d35cdde
Add query-key normalization to CausalAttn and Attention classes, incl…
rka97 Oct 23, 2025
ffb8163
update target
priyakasimbeg Oct 24, 2025
2cc9dff
Merge branch 'lm_workload' of github.com:mlcommons/algorithmic-effici…
priyakasimbeg Oct 24, 2025
202e5cb
add pytorch nadamw_target_setting
priyakasimbeg Oct 26, 2025
98e491a
docker updates for a100
priyakasimbeg Oct 27, 2025
2 changes: 1 addition & 1 deletion .gitignore
@@ -25,4 +25,4 @@ scoring/plots/
!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv

algoperf/_version.py
algoperf/_version.py
49 changes: 48 additions & 1 deletion algoperf/checkpoint_utils.py
@@ -5,14 +5,16 @@
"""

import os
from typing import Sequence, Tuple
from typing import Optional, Sequence, Tuple

import numpy as np
import orbax.checkpoint as ocp
import torch
from absl import logging
from flax import jax_utils
from flax.training import checkpoints as flax_checkpoints
from flax.training.checkpoints import latest_checkpoint
from orbax.checkpoint.type_handlers import NumpyHandler
from tensorflow.io import gfile # pytype: disable=import-error

from algoperf import spec
@@ -30,6 +32,51 @@
]


class BoolHandler(NumpyHandler):
"""
An implementation of TypeHandler for np.bool_ that inherits from NumpyHandler.
It works by treating the scalar as a 0-dimensional array.
"""

def typestr(self) -> str:
"""Unique string identifier for this handler."""
return 'np.bool_'

async def serialize(
self,
values: Sequence[np.bool_],
infos: Sequence,
args: Optional[Sequence[ocp.SaveArgs]] = None,
):
"""
Serializes a sequence of np.bool_ scalars by first converting them
to 0-dim numpy arrays and then calling the parent NumpyHandler.
"""
# Convert each scalar np.bool_ to a 0-dimensional np.ndarray
array_values = [np.asarray(v, dtype=np.bool_) for v in values]
# Use the parent class's robust serialization logic
return await super().serialize(array_values, infos, args)

async def deserialize(
self,
infos: Sequence,
args: Optional[Sequence[ocp.RestoreArgs]] = None,
) -> Sequence[np.bool_]:
"""
Deserializes into a sequence of np.bool_ scalars by calling the
parent handler and then converting the resulting 0-dim arrays.
"""
# Parent deserialize will return a sequence of 0-dimensional np.ndarray
results = await super().deserialize(infos, args)

# Convert each 0-d array back to an np.bool_ scalar using .item()
scalar_results = [np.bool_(r.item()) for r in results]
return scalar_results


ocp.type_handlers.register_type_handler(np.bool_, BoolHandler(), override=True)


def maybe_restore_checkpoint(
framework: str,
optimizer_state: spec.OptimizerState,
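A minimal sketch of the new handler in action, assuming a PyTreeCheckpointer and a hypothetical train state with a scalar np.bool_ leaf (only the BoolHandler and its registration come from this diff):

import numpy as np
import orbax.checkpoint as ocp

# Hypothetical state: a scalar np.bool_ leaf such as a "target reached" flag.
state = {'step': np.int64(1000), 'target_reached': np.bool_(False)}

checkpointer = ocp.PyTreeCheckpointer()
# With BoolHandler registered for np.bool_, the scalar is stored as a
# 0-dimensional array instead of tripping over an unsupported leaf type.
checkpointer.save('/tmp/example_ckpt', state)

# deserialize() converts the stored 0-dim array back into an np.bool_ scalar.
restored = checkpointer.restore('/tmp/example_ckpt')
print(type(restored['target_reached']))  # expected: <class 'numpy.bool_'>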
2 changes: 2 additions & 0 deletions algoperf/param_utils.py
@@ -44,6 +44,8 @@ def pytorch_param_types(
param_types[name] = spec.ParameterType.ATTENTION_BIAS
elif 'in_proj' in name:
param_types[name] = spec.ParameterType.ATTENTION_QKV
elif 'qkv' in name:
param_types[name] = spec.ParameterType.ATTENTION_QKV
elif 'kv_proj' in name:
param_types[name] = spec.ParameterType.ATTENTION_KV
elif 'k_proj' in name or 'key' in name:
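For illustration, the kind of parameter name the new branch is meant to catch (the name below is hypothetical, not taken from the workload):

# A fused attention projection, e.g. from the new LM model, typically shows up
# as a single parameter whose name contains 'qkv'.
name = 'blocks.0.attn.qkv.weight'  # hypothetical parameter name
# It does not match the earlier 'in_proj' branch, but the added `elif 'qkv' in
# name:` branch classifies it as spec.ParameterType.ATTENTION_QKV rather than
# letting it fall through to the k/q/v-specific branches below.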
6 changes: 4 additions & 2 deletions algoperf/pytorch_utils.py
@@ -27,7 +27,9 @@ def pytorch_setup() -> Tuple[bool, int, torch.device, int]:
return use_pytorch_ddp, rank, device, n_gpus


def pytorch_init(use_pytorch_ddp: bool, rank: int, profiler: Profiler) -> None:
def pytorch_init(
use_pytorch_ddp: bool, rank: int, profiler: Profiler, limit_tf_threads=True
) -> None:
# Make sure no GPU memory is preallocated to Jax.
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
# Only use CPU for Jax to avoid memory issues.
@@ -39,7 +41,7 @@ def pytorch_init(use_pytorch_ddp: bool, rank: int, profiler: Profiler) -> None:

if use_pytorch_ddp:
# Avoid tf input pipeline creating too many threads.
if rank != 0:
if rank != 0 and limit_tf_threads:
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)

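A short usage sketch of the new flag; the caller and the profiler placeholder are assumptions, only the signature above comes from the diff:

from algoperf import pytorch_utils

use_pytorch_ddp, rank, device, n_gpus = pytorch_utils.pytorch_setup()
profiler = ...  # an algoperf Profiler (or pass-through) instance

# For a workload whose input pipeline is not built on tf.data (for example one
# backed by Hugging Face datasets), the per-rank TF thread limiting is not
# needed, so the caller can opt out of it.
pytorch_utils.pytorch_init(
    use_pytorch_ddp, rank, profiler, limit_tf_threads=False
)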
4 changes: 2 additions & 2 deletions algoperf/random_utils.py
@@ -35,13 +35,13 @@ def _signed_to_unsigned(seed: SeedType) -> SeedType:

def _fold_in(seed: SeedType, data: Any) -> List[Union[SeedType, Any]]:
rng = np.random.RandomState(seed=_signed_to_unsigned(seed))
new_seed = rng.randint(MIN_INT32, MAX_INT32, dtype=np.int32)
new_seed = rng.randint(MIN_INT32, MAX_INT32, dtype=np.uint32)
return [new_seed, data]


def _split(seed: SeedType, num: int = 2) -> SeedType:
rng = np.random.RandomState(seed=_signed_to_unsigned(seed))
return rng.randint(MIN_INT32, MAX_INT32, dtype=np.int32, size=[num, 2])
return rng.randint(MIN_INT32, MAX_INT32, dtype=np.uint32, size=[num, 2])


def _PRNGKey(seed: SeedType) -> SeedType: # pylint: disable=invalid-name
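For context, a standalone NumPy sketch of why unsigned seeds matter here (plain NumPy, not this module's helpers): np.random.RandomState only accepts seeds in [0, 2**32 - 1], so signed 32-bit seeds have to be reinterpreted as unsigned before they are reused.

import numpy as np

def signed_to_unsigned(seed: int) -> int:
  # Reinterpret a signed 32-bit seed as unsigned, e.g. -1 -> 2**32 - 1,
  # so it is a valid RandomState seed.
  return seed % 2**32

rng = np.random.RandomState(seed=signed_to_unsigned(-1))
child_seed = rng.randint(0, 2**32, dtype=np.uint32)  # unsigned 32-bit child seed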
Empty file.
153 changes: 153 additions & 0 deletions algoperf/workloads/lm/input_pipeline.py
@@ -0,0 +1,153 @@
"""Input pipeline for a LM dataset."""

import functools
import os
from typing import Optional

import jax
import tensorflow as tf

from algoperf import data_utils

AUTOTUNE = tf.data.experimental.AUTOTUNE
PAD_ID = tf.constant(-1, dtype=tf.int64)

TFDS_SPLIT_NAME = {'train': 'train', 'eval_train': 'train', 'validation': 'val'}

SEQUENCE_LENGTH = 1024
MAX_CORPUS_CHARS = 1_000_000_000
SHUFFLE_BUFFER_SIZE = 1000
VOCAB_SIZE = 50_257


def batch_with_padding(
dataset: tf.data.Dataset,
batch_size,
padded_shapes=None,
padding_id=PAD_ID,
):
"""Batches a tf.data.Dataset and adds padding if len(dataset) is not divisible by the batch size.

Args:
dataset: tf.data.Dataset
batch_size: batch size of resulting batched dataset
padded_shapes: shapes of the padded batches
padding_id: value used to pad elements of the final, smaller batch

Returns:
tf.data.Dataset in which the final, smaller batch is padded with padding_id
so that every batch has exactly batch_size elements.
"""
batched_dataset = dataset.batch(batch_size, drop_remainder=False)

# tf.data.Dataset.padded_batch pads the elements within each batch, so we call
# it again with batch_size=1 to pad each batch of the original batched dataset.
padded_batched_dataset = batched_dataset.padded_batch(
1, padded_shapes=padded_shapes, padding_values=padding_id
)

# Remove extra dimension resulting from the batch_size=1.
padded_batched_dataset = padded_batched_dataset.unbatch()

return padded_batched_dataset


def get_data_iter(
data_rng: jax.random.PRNGKey,
split: str,
data_dir: str,
batch_size: int,
num_batches: Optional[int] = None,
):
ds = get_lm_dataset(data_rng, split, data_dir, batch_size, num_batches)

it = map(
functools.partial(
data_utils.shard_and_maybe_pad_np, global_batch_size=batch_size
),
ds,
)

return iter(it)


def get_lm_dataset(
data_rng: jax.random.PRNGKey,
split: str,
data_dir: str,
batch_size: int,
num_batches: Optional[int] = None,
):
"""Load preprocessed TF dataset."""
if split not in TFDS_SPLIT_NAME:
raise NotImplementedError

shuffle_seed = jax.random.randint(data_rng, (), -(2**31), 2**31 - 1)

data_dir = os.path.join(data_dir, TFDS_SPLIT_NAME[split])
tokens_ds = tf.data.Dataset.load(data_dir)

# Flatten the stored examples into a stream of individual tokens.
tokens_ds = tokens_ds.flat_map(tf.data.Dataset.from_tensor_slices)

# Pack tokens into sequences of SEQUENCE_LENGTH + 1.
sequences_ds = tokens_ds.batch(SEQUENCE_LENGTH + 1, drop_remainder=True)

# Split each sequence into inputs and shifted next-token targets.
sequences_ds = sequences_ds.map(
lambda x: {
'inputs': x['input_ids'][:SEQUENCE_LENGTH],
'targets': x['input_ids'][1:],
},
num_parallel_calls=AUTOTUNE,
)
if split == 'train':
ds = sequences_ds.shuffle(SHUFFLE_BUFFER_SIZE, seed=shuffle_seed)
ds = ds.batch(batch_size, drop_remainder=False)
ds = ds.take(num_batches) if num_batches is not None else ds
ds = ds.repeat()
ds = ds.map(
lambda x: {
'inputs': x['inputs'],
'targets': x['targets'],
'weights': None,
}
)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
elif split == 'eval_train':
ds = batch_with_padding(
sequences_ds,
batch_size,
padded_shapes={
'inputs': (batch_size, None),
'targets': (batch_size, None),
},
)
ds = ds.take(num_batches) if num_batches is not None else ds
ds = ds.repeat()
ds = ds.map(
lambda x: {
'inputs': x['inputs'],
'targets': x['targets'],
'weights': tf.where(tf.equal(x['inputs'], PAD_ID), 0.0, 1.0),
}
)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
elif split == 'validation':
ds = batch_with_padding(
sequences_ds,
batch_size,
padded_shapes={
'inputs': (batch_size, None),
'targets': (batch_size, None),
},
)
ds = ds.take(num_batches) if num_batches is not None else ds
ds = ds.repeat()
ds = ds.map(
lambda x: {
'inputs': x['inputs'],
'targets': x['targets'],
'weights': tf.where(tf.equal(x['inputs'], PAD_ID), 0.0, 1.0),
}
)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
return ds
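A hedged sketch of how a workload might drive this pipeline; the data directory and batch size are placeholders, and the shapes in the comments are expectations rather than guarantees:

import jax

from algoperf.workloads.lm import input_pipeline

data_rng = jax.random.PRNGKey(0)

# `data_dir` is assumed to hold pre-tokenized splits written with
# tf.data.Dataset.save under `train/` and `val/` subdirectories, matching
# TFDS_SPLIT_NAME above.
train_iter = input_pipeline.get_data_iter(
    data_rng,
    split='train',
    data_dir='/data/fineweb_edu',  # hypothetical path
    batch_size=8,
)

batch = next(train_iter)
# Each batch is a dict with 'inputs' and 'targets' (plus 'weights' for the
# padded eval splits), sharded across local devices by
# data_utils.shard_and_maybe_pad_np.
print(batch['inputs'].shape)  # e.g. (num_devices, 8 // num_devices, 1024)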
Empty file.