#!/usr/bin/env python3

-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -95,12 +95,20 @@ def parse_args():
        required=True,
        help="Path to the Triton model repository holding the models to be served",
    )
+    # TODO: determine what to do with single tokenizer flag
    triton_group.add_argument(
        "--tokenizer",
        type=str,
        default=None,
        help="HuggingFace ID or local folder path of the Tokenizer to use for chat templates",
    )
+    triton_group.add_argument(
+        "--tokenizers",
+        type=str,
+        nargs="+",  # Accept multiple arguments
+        default=[],
+        help="List of HuggingFace IDs or local folder paths of Tokenizers to use. Format: model_name:tokenizer_path",
+    )
    triton_group.add_argument(
        "--backend",
        type=str,
@@ -160,8 +168,22 @@ def parse_args():
def main():
    args = parse_args()

-    # Initialize a Triton Inference Server pointing at LLM models
-    server: tritonserver.Server = tritonserver.Server(
+    # Parse tokenizer mappings
+    tokenizer_map = {}
+    for tokenizer_spec in args.tokenizers:
+        try:
+            model_name, tokenizer_path = tokenizer_spec.split(":")
+            tokenizer_map[model_name] = tokenizer_path
+        except ValueError:
+            print(
+                f"Warning: Skipping invalid tokenizer specification: {tokenizer_spec}. Format should be 'model_name:tokenizer_path'"
+            )
+
+    if args.tokenizer:
+        tokenizer_map["default"] = args.tokenizer
+
+    # Initialize Triton server
+    server = tritonserver.Server(
        model_repository=args.model_repository,
        log_verbose=args.tritonserver_log_verbose_level,
        log_info=True,
@@ -170,8 +192,8 @@ def main():
    ).start(wait_until_ready=True)

    # Wrap Triton Inference Server in an interface-conforming "LLMEngine"
-    engine: TritonLLMEngine = TritonLLMEngine(
-        server=server, tokenizer=args.tokenizer, backend=args.backend
+    engine = TritonLLMEngine(
+        server=server, tokenizer_map=tokenizer_map, backend=args.backend
    )

    # Attach TritonLLMEngine as the backbone for inference and model management
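Note for reviewers: below is a minimal standalone sketch of the mapping logic this diff adds to main(), showing how the new --tokenizers flag composes with the legacy --tokenizer fallback. The model names and tokenizer paths are made up for illustration.

    # Simulates the argparse output of a hypothetical invocation:
    #   --tokenizers modelA:meta-llama/Llama-2-7b-hf modelB:/opt/tokenizers/mistral --tokenizer gpt2
    tokenizers = ["modelA:meta-llama/Llama-2-7b-hf", "modelB:/opt/tokenizers/mistral"]
    tokenizer = "gpt2"  # legacy single --tokenizer flag, if given

    tokenizer_map = {}
    for tokenizer_spec in tokenizers:
        try:
            # Expects exactly one colon per spec: model_name:tokenizer_path
            model_name, tokenizer_path = tokenizer_spec.split(":")
            tokenizer_map[model_name] = tokenizer_path
        except ValueError:
            print(
                f"Warning: Skipping invalid tokenizer specification: {tokenizer_spec}. Format should be 'model_name:tokenizer_path'"
            )

    # The single-tokenizer flag becomes the fallback entry
    if tokenizer:
        tokenizer_map["default"] = tokenizer

    print(tokenizer_map)
    # {'modelA': 'meta-llama/Llama-2-7b-hf', 'modelB': '/opt/tokenizers/mistral', 'default': 'gpt2'}

One thing worth flagging: because split(":") unpacks into exactly two fields, any spec whose path itself contains a colon (e.g. a Windows drive path) is skipped with the warning; split(":", 1) would accept those if that case matters here.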