Changes from 3 commits
@@ -288,7 +288,6 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers
-        self.eos_id = model_args.eos_id
         self.head_dim = model_args.head_dim
 
         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
1 change: 0 additions & 1 deletion torchtitan/experiments/deterministic_vllm_rl/simple_rl.py
@@ -332,7 +332,6 @@ def load_model(checkpoint_path: str, model_path: str, use_vllm_compat: bool = Tr
         max_seq_len=getattr(hf_config, "max_position_embeddings", 32768),
         qk_norm=True,
         depth_init=True,
-        eos_id=getattr(hf_config, "eos_token_id", 151645),
     )
 
     # state_dict is in standard TorchTitan format (w1, w2, w3)
1 change: 0 additions & 1 deletion torchtitan/experiments/transformers_backend/model/args.py
@@ -54,7 +54,6 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs):
             "n_kv_heads": "num_key_value_heads",
             "norm_eps": "rms_norm_eps",
             "max_seq_len": "max_position_embeddings",
-            "eos_id": "eos_token_id",

Contributor:
this seems a different usage of eos_id, let's revert this change

         }
     }
 
1 change: 0 additions & 1 deletion torchtitan/models/llama3/model/args.py
@@ -45,7 +45,6 @@ class TransformerModelArgs(BaseModelArgs):
 
     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
-    eos_id: int = 0
 
     def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
         seq_len = job_config.training.seq_len
1 change: 0 additions & 1 deletion torchtitan/models/qwen3/model/args.py
@@ -38,7 +38,6 @@ class Qwen3ModelArgs(BaseModelArgs):
 
     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
-    eos_id: int = 151645
 
     enable_weight_tying: bool = False
 
1 change: 0 additions & 1 deletion torchtitan/models/qwen3/model/model.py
@@ -384,7 +384,6 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers
-        self.eos_id = model_args.eos_id
         self.head_dim = model_args.head_dim
 
         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
31 changes: 31 additions & 0 deletions torchtitan/models/utils.py
@@ -468,3 +468,34 @@ def get_moe_model_nparams_and_flops(
     nparams = nparams - nparams_embedding
 
     return nparams, num_flops_per_token
+
+
+def validate_tokenizer_model_alignment(

Contributor:
maybe call it

    Suggested change:
    -def validate_tokenizer_model_alignment(
    +def validate_tokenizer_model_compatibility(

since we no longer require them to be identical

Author:
@tianyu-l That makes sense. I've updated the function name accordingly, and also reverted the removal of eos_id. Thanks for the suggestion!

+    tokenizer: "BaseTokenizer | None",
+    model_args: "BaseModelArgs",

Contributor:
Do the type hints need to be surrounded by quotation marks?

Author:
@wwwjn Thanks for pointing this out! I originally wrote it this way to avoid potential circular import issues (like https://github.com/pytorch/torchtitan/blob/main/torchtitan/components/metrics.py#L496). However, after testing on Python 3.10 and above, it seems to work fine without the quotation marks, so I've updated the code accordingly.
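
(For reference: a minimal standalone sketch of the two annotation styles discussed above. The helper name is hypothetical and the import paths are assumptions based on torchtitan's layout, not taken from this PR.)

# Option A: quoted ("string") annotations. The strings are never evaluated at
# runtime, so the imports can be confined to type checking and no import cycle
# is created. This works regardless of Python version.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from torchtitan.components.tokenizer import BaseTokenizer
    from torchtitan.protocols.train_spec import BaseModelArgs


def check_quoted(tokenizer: "BaseTokenizer | None", model_args: "BaseModelArgs") -> None:
    ...


# Option B: unquoted annotations. These are evaluated when the function is
# defined, so the names need real module-level imports, and the bare
# `BaseTokenizer | None` union requires Python 3.10+ (PEP 604) unless
# `from __future__ import annotations` is placed at the top of the module.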

+) -> None:
+    """
+    Validate that tokenizer configuration is compatible with model configuration.
+
+    Args:
+        tokenizer: Tokenizer instance to validate. Can be None.
+        model_args: Model arguments object containing configuration to validate against.
+
+    Raises:
+        ValueError: If tokenizer vocab_size exceeds model vocab_size, which would
+            cause index out of bounds errors during training.
+    """
+    if tokenizer is None:
+        return
+
+    if hasattr(model_args, "vocab_size"):
+        tokenizer_vocab_size = tokenizer.get_vocab_size()
+        model_vocab_size = model_args.vocab_size
+        if model_vocab_size < tokenizer_vocab_size:
+            raise ValueError(
+                f"Model vocab_size ({model_vocab_size}) is smaller than "
+                f"tokenizer vocab_size ({tokenizer_vocab_size}). "
+                f"This will cause index out of bounds errors during training. "
+                f"The model's embedding layer must be at least as large as the "
+                f"tokenizer's vocabulary size."
+            )
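
(As a quick illustration of the new check — using hypothetical duck-typed stand-ins rather than real torchtitan objects — a tokenizer whose vocabulary is larger than the model's embedding table is rejected before training starts. The import path comes from the train.py change below.)

from dataclasses import dataclass

from torchtitan.models.utils import validate_tokenizer_model_alignment


@dataclass
class FakeModelArgs:
    vocab_size: int = 32000


class FakeTokenizer:
    def get_vocab_size(self) -> int:
        return 151665


# Raises ValueError: Model vocab_size (32000) is smaller than
# tokenizer vocab_size (151665). ...
validate_tokenizer_model_alignment(FakeTokenizer(), FakeModelArgs())
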
3 changes: 3 additions & 0 deletions torchtitan/train.py
@@ -25,6 +25,7 @@
 )
 from torchtitan.config import ConfigManager, JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims, utils as dist_utils
+from torchtitan.models.utils import validate_tokenizer_model_alignment
 from torchtitan.protocols.model_converter import build_model_converters
 from torchtitan.tools import utils
 from torchtitan.tools.logging import init_logger, logger
@@ -134,6 +135,8 @@ def __init__(self, job_config: JobConfig):
         model_args.update_from_config(job_config)
         self.model_args = model_args
 
+        validate_tokenizer_model_alignment(self.tokenizer, model_args)
+
         logger.info(
             f"Building {job_config.model.name} {job_config.model.flavor} with {model_args}"
         )