 from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig,
                         LlmBuildStats, ModelLoader, _ModelRuntimeContext)
 from .mpi_session import MpiPoolSession, external_mpi_comm_available
-from .tokenizer import (TokenizerBase, _xgrammar_tokenizer_info)
+from .tokenizer import TokenizerBase, _xgrammar_tokenizer_info
 # TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import
 from .utils import (append_docstring, exception_handler, get_device_count,
                     print_colored_debug, set_api_status)
@@ -959,27 +959,8 @@ def _build_model(self):
 
         assert isinstance(self.args, TorchLlmArgs)
 
-        # self._executor_config = tllm.ExecutorConfig(
-        #     max_beam_width=self.args.max_beam_width,
-        #     scheduler_config=PybindMirror.maybe_to_pybind(
-        #         self.args.scheduler_config),
-        #     max_batch_size=self.args.max_batch_size,
-        #     max_num_tokens=self.args.max_num_tokens,
-        #     gather_generation_logits=self.args.gather_generation_logits,
-        #     fail_fast_on_attention_window_too_large=getattr(
-        #         self.args, 'fail_fast_on_attention_window_too_large', False),
-        #     **kwargs)
-
-        # self._executor_config = self.args.get_executor_config(self._hf_model_dir)
-
         # TODO: revisit gather_context_logits
         return_logits = self.args.gather_generation_logits
-
-        print("---- self._executor_cls is: {}".format(self._executor_cls),
-              flush=True)
-        print("---- self._engine_dir is: {}".format(self._engine_dir),
-              flush=True)
-
         self._executor = self._executor_cls.create(
             self._engine_dir,
             executor_config=None,