Commit db267e3: local changes

1 parent 6135f75

3 files changed: 17 additions & 3 deletions


docker/Makefile

Lines changed: 7 additions & 1 deletion
@@ -120,7 +120,7 @@ endef
 DOCKER_RUN_OPTS ?= --rm -it --ipc=host --ulimit stack=67108864 $(if $(filter 0,$(IS_ROOTLESS)),--ulimit memlock=-1)
 DOCKER_RUN_ARGS ?=
 # Check if NVIDIA_VISIBLE_DEVICES is set and not empty
-NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NVIDIA_VISIBLE_DEVICES)
+NVIDIA_VISIBLE_DEVICES_VAL = $(shell echo $$NV_GPU)
 ifeq ($(NVIDIA_VISIBLE_DEVICES_VAL),)
 # If empty or not set, use all GPUs
 GPU_OPTS ?= --gpus=all
@@ -149,17 +149,23 @@ ifeq ($(LOCAL_USER),1)
 	$(call add_local_user,$(IMAGE_WITH_TAG))
 endif
 	docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \
+		--network=host \
 		$(GPU_OPTS) \
 		--volume $(SOURCE_DIR):$(CODE_DIR) \
 		$(if $(and $(filter 1,$(LOCAL_USER)),$(shell [ -w "$(USER_CACHE_DIR)" ] && echo 1)),--volume $(USER_CACHE_DIR):/home/$(USER_NAME)/.cache:rw) \
 		--env "CCACHE_DIR=$(CCACHE_DIR)" \
 		--env "CCACHE_BASEDIR=$(CODE_DIR)" \
 		--env "CONAN_HOME=$(CONAN_DIR)" \
+		--env "HF_HOME=/home/scratch.williamz_gpu/code/trtc/builder/hf_cache" \
+		--volume /home/scratch.trt_llm_data:/home/scratch.trt_llm_data \
+		--volume /home/scratch.williamz_gpu:/home/scratch.williamz_gpu \
 		--workdir $(WORK_DIR) \
 		--hostname $(shell hostname)-$* \
 		--name $(CONTAINER_NAME)-$*-$(USER_NAME) \
 		--tmpfs /tmp:exec \
 		$(IMAGE_WITH_TAG)$(IMAGE_TAG_SUFFIX) $(RUN_CMD)
+		# $(if $(filter 1,$(LOCAL_USER)),--volume ${HOME_DIR}/.cache:/home/${USER_NAME}/.cache:rw) \
+		# --env TLLM_LLMAPI_BUILD_CACHE_ROOT=/home/scratch.williamz_gpu/trtllm_llmapi_cache \

 devel_%: STAGE = devel
 tritondevel_%: STAGE = tritondevel
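
For context, the first hunk only changes which environment variable feeds GPU_OPTS: the value now comes from NV_GPU instead of NVIDIA_VISIBLE_DEVICES (the comment and the NVIDIA_VISIBLE_DEVICES_VAL variable name still refer to the old name). Below is a minimal Python sketch of that selection logic; the non-empty branch is an assumption, since the hunk only shows the fallback to --gpus=all.

import os

def gpu_opts() -> str:
    # After this commit the Makefile reads NV_GPU; when it is unset or empty,
    # the container is started with access to all GPUs.
    devices = os.environ.get("NV_GPU", "").strip()
    if not devices:
        return "--gpus=all"
    # Assumed behaviour of the branch outside the hunk: forward the requested
    # device list to `docker run`.
    return f"--gpus=device={devices}"

print(gpu_opts())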

tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,9 @@ class HfWeightLoader(BaseWeightLoader):

     def load_weights(self, checkpoint_dir: str) -> dict[str, Any]:
         weight_files = glob.glob(f"{checkpoint_dir}/*.safetensors")
+        print(f"==== BEFORE: {len(weight_files)}")
+        weight_files = [x for x in weight_files if "consolidated" not in x]
+        print(f"==== AFTER: {len(weight_files)}")
         if weight_files:
             # Prefetch the weight files to CPU memory if the size is less than 90% of the available memory.
             # This is a heuristic to avoid prefetching files that are too large and causing file cache thrashing.
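
The hunk above is debug-style scratch work: it prints how many *.safetensors files were found, drops any whose name contains "consolidated" (some checkpoints, Mistral releases for example, ship a single consolidated.safetensors next to the sharded model-*.safetensors files, and loading both would read every weight twice), then prints the count again. A standalone sketch of the same filtering, without the prints, could look like this; the helper name is illustrative, and the real logic stays inline in HfWeightLoader.load_weights.

import glob
import os

def sharded_safetensors(checkpoint_dir: str) -> list[str]:
    # Collect every safetensors file in the checkpoint directory ...
    weight_files = glob.glob(os.path.join(checkpoint_dir, "*.safetensors"))
    # ... and skip consolidated dumps so each weight is only loaded once.
    return [f for f in weight_files if "consolidated" not in os.path.basename(f)]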

tensorrt_llm/builder.py

Lines changed: 7 additions & 2 deletions
@@ -685,8 +685,13 @@ def from_json_file(cls, config_file, plugin_config=None):
     def to_dict(self):
         output = copy.deepcopy(self.__dict__)
         # the enum KVCacheType cannot be converted automatically
-        if output.get('kv_cache_type', None) is not None:
-            output['kv_cache_type'] = str(output['kv_cache_type'].name)
+        if (kv_cache_type := output.get('kv_cache_type', None)) is not None:
+            if isinstance(kv_cache_type, KVCacheType):
+                output['kv_cache_type'] = str(kv_cache_type.name)
+            elif isinstance(kv_cache_type, str):
+                output['kv_cache_type'] = kv_cache_type
+            else:
+                raise TypeError(f"Unknown type: {type(kv_cache_type)}")
         output['plugin_config'] = output['plugin_config'].to_dict()
         output['lora_config'] = output['lora_config'].to_dict()
         output['auto_parallel_config'] = output['auto_parallel_config'].to_dict(
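
The previous code assumed kv_cache_type was always a KVCacheType enum and unconditionally called .name on it, which raises AttributeError if the field already holds a string (plausible when a config was loaded back from JSON, judging by the from_json_file method in the same class). The new branch accepts either form and rejects anything else. A self-contained sketch of the same normalization rule, using a stand-in enum in place of tensorrt_llm's KVCacheType:

from enum import Enum

class KVCacheType(Enum):  # stand-in for the real tensorrt_llm enum
    CONTINUOUS = 0
    PAGED = 1
    DISABLED = 2

def normalize_kv_cache_type(value) -> str:
    # Accept the enum or an already-serialized string; reject anything else,
    # mirroring the to_dict() change above.
    if isinstance(value, KVCacheType):
        return value.name
    if isinstance(value, str):
        return value
    raise TypeError(f"Unknown type: {type(value)}")

assert normalize_kv_cache_type(KVCacheType.PAGED) == "PAGED"
assert normalize_kv_cache_type("PAGED") == "PAGED"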
