Skip to content

Commit edcbf8a

Browse files
chzblych authored and ZhanruiSunCh committed
[None][infra] Avoid intermittently broken access to nvcr.io (NVIDIA#6715)
Signed-off-by: Yanchao Lu <[email protected]> Co-authored-by: Zhanrui Sun <[email protected]> Signed-off-by: Wangshanshan <[email protected]>
1 parent d33e35c commit edcbf8a

File tree

6 files changed

+44
-35
lines changed

6 files changed

+44
-35
lines changed

docker/Makefile

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# Default base image for the docker build as defined in Dockerfile.multi
22
BASE_IMAGE ?= $(shell grep '^ARG BASE_IMAGE=' Dockerfile.multi | grep -o '=.*' | tr -d '="')
33
BASE_TAG ?= $(shell grep '^ARG BASE_TAG=' Dockerfile.multi | grep -o '=.*' | tr -d '="')
4+
TRITON_IMAGE ?= $(shell grep '^ARG TRITON_IMAGE=' Dockerfile.multi | grep -o '=.*' | tr -d '="')
5+
TRITON_BASE_TAG ?= $(shell grep '^ARG TRITON_BASE_TAG=' Dockerfile.multi | grep -o '=.*' | tr -d '="')
46
# Name of the new image
57
IMAGE_NAME ?= tensorrt_llm
68
IMAGE_TAG ?= latest
@@ -80,6 +82,8 @@ endef
8082
--progress $(DOCKER_PROGRESS) \
8183
$(if $(BASE_IMAGE), --build-arg BASE_IMAGE=$(BASE_IMAGE)) \
8284
$(if $(BASE_TAG), --build-arg BASE_TAG=$(BASE_TAG)) \
85+
$(if $(TRITON_IMAGE), --build-arg TRITON_IMAGE=$(TRITON_IMAGE)) \
86+
$(if $(TRITON_BASE_TAG), --build-arg TRITON_BASE_TAG=$(TRITON_BASE_TAG)) \
8387
$(if $(BUILD_WHEEL_ARGS), --build-arg BUILD_WHEEL_ARGS="$(BUILD_WHEEL_ARGS)") \
8488
$(if $(BUILD_WHEEL_SCRIPT), --build-arg BUILD_WHEEL_SCRIPT="$(BUILD_WHEEL_SCRIPT)") \
8589
$(if $(TORCH_INSTALL_TYPE), --build-arg TORCH_INSTALL_TYPE="$(TORCH_INSTALL_TYPE)") \
@@ -187,16 +191,16 @@ jenkins-aarch64_%: STAGE = tritondevel
187191
jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_VERSION}),PY312,$(if $(findstring 3.10,${PYTHON_VERSION}),PY310,$(error Unknown PYTHON_VERSION specified)))
188192
jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE)
189193
jenkins-rockylinux8_%: STAGE = tritondevel
190-
jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda
194+
jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
191195
jenkins-rockylinux8_%: BASE_TAG = 12.9.1-devel-rockylinux8
192196

193197
rockylinux8_%: STAGE = tritondevel
194-
rockylinux8_%: BASE_IMAGE = nvidia/cuda
198+
rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
195199
rockylinux8_%: BASE_TAG = 12.9.1-devel-rockylinux8
196200

197201
# For x86_64 and aarch64
198202
ubuntu22_%: STAGE = tritondevel
199-
ubuntu22_%: BASE_IMAGE = nvidia/cuda
203+
ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda
200204
ubuntu22_%: BASE_TAG = 12.9.1-devel-ubuntu22.04
201205

202206
trtllm_%: STAGE = release

jenkins/BuildDockerImage.groovy

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -281,23 +281,30 @@ def buildImage(config, imageKeyToTag)
281281
try {
282282
def build_jobs = BUILD_JOBS
283283
// Fix the triton image pull timeout issue
284+
def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
284285
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
285286
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
286287

288+
if (target == "rockylinux8") {
289+
BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
290+
}
291+
292+
// Replace the base image and triton image with the internal mirror
293+
BASE_IMAGE = BASE_IMAGE.replace("nvcr.io/", "urm.nvidia.com/docker/")
294+
TRITON_IMAGE = TRITON_IMAGE.replace("nvcr.io/", "urm.nvidia.com/docker/")
295+
287296
if (dependent) {
288297
stage ("make ${dependent.target}_${action} (${arch})") {
289-
retry(3) {
290-
sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
291-
}
292-
retry(3) {
293-
sh """
294-
cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \
295-
TORCH_INSTALL_TYPE=${torchInstallType} \
296-
IMAGE_WITH_TAG=${dependentImageWithTag} \
297-
STAGE=${dependent.dockerfileStage} \
298-
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
299-
"""
300-
}
298+
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
299+
trtllm_utils.llmExecStepWithRetry(this, script: """
300+
cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \
301+
BASE_IMAGE=${BASE_IMAGE} \
302+
TRITON_IMAGE=${TRITON_IMAGE} \
303+
TORCH_INSTALL_TYPE=${torchInstallType} \
304+
IMAGE_WITH_TAG=${dependentImageWithTag} \
305+
STAGE=${dependent.dockerfileStage} \
306+
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
307+
""", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
301308
args += " DEVEL_IMAGE=${dependentImageWithTag}"
302309
if (target == "ngc-release") {
303310
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
@@ -315,18 +322,16 @@ def buildImage(config, imageKeyToTag)
315322
}
316323
}
317324
stage ("make ${target}_${action} (${arch})") {
318-
retry(3) {
319-
sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
320-
}
321-
retry(3) {
322-
sh """
323-
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
324-
TORCH_INSTALL_TYPE=${torchInstallType} \
325-
IMAGE_WITH_TAG=${imageWithTag} \
326-
STAGE=${dockerfileStage} \
327-
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
328-
"""
329-
}
325+
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
326+
trtllm_utils.llmExecStepWithRetry(this, script: """
327+
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
328+
BASE_IMAGE=${BASE_IMAGE} \
329+
TRITON_IMAGE=${TRITON_IMAGE} \
330+
TORCH_INSTALL_TYPE=${torchInstallType} \
331+
IMAGE_WITH_TAG=${imageWithTag} \
332+
STAGE=${dockerfileStage} \
333+
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
334+
""", sleepInSecs: 300, shortCommondRunTimeMax: 7200)
330335
if (target == "ngc-release") {
331336
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
332337
}
@@ -336,6 +341,8 @@ def buildImage(config, imageKeyToTag)
336341
stage ("custom tag: ${customTag} (${arch})") {
337342
sh """
338343
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
344+
BASE_IMAGE=${BASE_IMAGE} \
345+
TRITON_IMAGE=${TRITON_IMAGE} \
339346
TORCH_INSTALL_TYPE=${torchInstallType} \
340347
IMAGE_WITH_TAG=${customImageWithTag} \
341348
STAGE=${dockerfileStage} \

jenkins/L0_Test.groovy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
3939
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312
4040

4141
// DLFW torch image
42-
DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.06-py3"
42+
DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3"
4343

4444
// Ubuntu base image
4545
UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"

jenkins/current_image_tags.properties

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
#
1212
# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. If the
1313
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
14-
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508051130-6090
15-
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508051130-6090
16-
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508051130-6090
17-
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508051130-6090
14+
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508110900-6715
15+
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508110900-6715
16+
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508110900-6715
17+
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508110900-6715

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,5 +219,6 @@ l0_dgx_h100:
219219
terms:
220220
stage: post_merge
221221
backend: triton
222+
auto_trigger: others
222223
tests:
223224
- triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]

tests/unittest/test_pip_install.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,6 @@ def test_pip_install():
5151
help="The wheel path")
5252
args = parser.parse_args()
5353

54-
if not os.environ.get("CUDA_HOME"):
55-
os.environ["CUDA_HOME"] = "/usr/local/cuda"
56-
5754
print("########## Install required system libs ##########")
5855
if not os.path.exists("/usr/local/mpi/bin/mpicc"):
5956
subprocess.check_call("apt-get -y install libopenmpi-dev", shell=True)

0 commit comments

Comments
 (0)