Skip to content

Commit 3a98789

Browse files
[TRTLLM-7141][infra] Use repo mirrors to avoid intermittent network failures (#6836)
Signed-off-by: Yanchao Lu <[email protected]>
Co-authored-by: Zhanrui Sun <[email protected]>
1 parent e54ba75 commit 3a98789

File tree

6 files changed

+62
-34
lines changed

6 files changed

+62
-34
lines changed

cpp/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -495,6 +495,17 @@ if(ENABLE_UCX)
495495
if(NOT ${ucx_FOUND})
496496
set(ENABLE_UCX 0)
497497
else()
498+
if(DEFINED ENV{GITHUB_MIRROR} AND NOT "$ENV{GITHUB_MIRROR}" STREQUAL "")
499+
if(EXISTS "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake")
500+
file(READ "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" FILE_CONTENTS)
501+
string(
502+
REPLACE "https://raw.githubusercontent.com/rapidsai/rapids-cmake"
503+
"$ENV{GITHUB_MIRROR}/rapidsai/rapids-cmake/raw/refs/heads"
504+
FILE_CONTENTS "${FILE_CONTENTS}")
505+
file(WRITE "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" "${FILE_CONTENTS}")
506+
message(WARNING "Replace UCXX fetch_rapids.cmake with internal mirror")
507+
endif()
508+
endif()
498509
# installing ucxx via add_subdirectory results in strange cudart linking
499510
# error, thus using their installation script to isolate the installation
500511
# process until the issue is understood. And always trigger the build so

docker/Makefile

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
# Default base image for the docker build as defined in Dockerfile.multi
22
BASE_IMAGE ?= $(shell grep '^ARG BASE_IMAGE=' Dockerfile.multi | grep -o '=.*' | tr -d '="')
33
BASE_TAG ?= $(shell grep '^ARG BASE_TAG=' Dockerfile.multi | grep -o '=.*' | tr -d '="')
4+
TRITON_IMAGE ?= $(shell grep '^ARG TRITON_IMAGE=' Dockerfile.multi | grep -o '=.*' | tr -d '="')
5+
TRITON_BASE_TAG ?= $(shell grep '^ARG TRITON_BASE_TAG=' Dockerfile.multi | grep -o '=.*' | tr -d '="')
46
# Name of the new image
57
IMAGE_NAME ?= tensorrt_llm
68
IMAGE_TAG ?= latest
@@ -80,6 +82,8 @@ endef
8082
--progress $(DOCKER_PROGRESS) \
8183
$(if $(BASE_IMAGE), --build-arg BASE_IMAGE=$(BASE_IMAGE)) \
8284
$(if $(BASE_TAG), --build-arg BASE_TAG=$(BASE_TAG)) \
85+
$(if $(TRITON_IMAGE), --build-arg TRITON_IMAGE=$(TRITON_IMAGE)) \
86+
$(if $(TRITON_BASE_TAG), --build-arg TRITON_BASE_TAG=$(TRITON_BASE_TAG)) \
8387
$(if $(BUILD_WHEEL_ARGS), --build-arg BUILD_WHEEL_ARGS="$(BUILD_WHEEL_ARGS)") \
8488
$(if $(BUILD_WHEEL_SCRIPT), --build-arg BUILD_WHEEL_SCRIPT="$(BUILD_WHEEL_SCRIPT)") \
8589
$(if $(TORCH_INSTALL_TYPE), --build-arg TORCH_INSTALL_TYPE="$(TORCH_INSTALL_TYPE)") \
@@ -187,16 +191,16 @@ jenkins-aarch64_%: STAGE = tritondevel
187191
jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_VERSION}),PY312,$(if $(findstring 3.10,${PYTHON_VERSION}),PY310,$(error Unknown PYTHON_VERSION specified)))
188192
jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE)
189193
jenkins-rockylinux8_%: STAGE = tritondevel
190-
jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda
194+
jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
191195
jenkins-rockylinux8_%: BASE_TAG = 12.9.1-devel-rockylinux8
192196

193197
rockylinux8_%: STAGE = tritondevel
194-
rockylinux8_%: BASE_IMAGE = nvidia/cuda
198+
rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
195199
rockylinux8_%: BASE_TAG = 12.9.1-devel-rockylinux8
196200

197201
# For x86_64 and aarch64
198202
ubuntu22_%: STAGE = tritondevel
199-
ubuntu22_%: BASE_IMAGE = nvidia/cuda
203+
ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda
200204
ubuntu22_%: BASE_TAG = 12.9.1-devel-ubuntu22.04
201205

202206
trtllm_%: STAGE = release

jenkins/BuildDockerImage.groovy

Lines changed: 37 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -258,7 +258,7 @@ def buildImage(config, imageKeyToTag)
258258
// Step 2: Build the images
259259
stage ("Install packages") {
260260
sh "pwd && ls -alh"
261-
sh "env"
261+
sh "env | sort"
262262
sh "apk add make git"
263263
sh "git config --global --add safe.directory '*'"
264264

@@ -281,23 +281,31 @@ def buildImage(config, imageKeyToTag)
281281
try {
282282
def build_jobs = BUILD_JOBS
283283
// Fix the triton image pull timeout issue
284-
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
285-
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
284+
def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
285+
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
286+
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
287+
288+
if (target == "rockylinux8") {
289+
BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
290+
}
291+
292+
// Replace the base image and triton image with the internal mirror
293+
BASE_IMAGE = BASE_IMAGE.replace("nvcr.io/", "urm.nvidia.com/docker/")
294+
TRITON_IMAGE = TRITON_IMAGE.replace("nvcr.io/", "urm.nvidia.com/docker/")
286295

287296
if (dependent) {
288297
stage ("make ${dependent.target}_${action} (${arch})") {
289-
retry(3) {
290-
sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
291-
}
292-
retry(3) {
293-
sh """
294-
cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \
295-
TORCH_INSTALL_TYPE=${torchInstallType} \
296-
IMAGE_WITH_TAG=${dependentImageWithTag} \
297-
STAGE=${dependent.dockerfileStage} \
298-
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
299-
"""
300-
}
298+
def randomSleep = (Math.random() * 300 + 300).toInteger()
299+
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200)
300+
trtllm_utils.llmExecStepWithRetry(this, script: """
301+
cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \
302+
BASE_IMAGE=${BASE_IMAGE} \
303+
TRITON_IMAGE=${TRITON_IMAGE} \
304+
TORCH_INSTALL_TYPE=${torchInstallType} \
305+
IMAGE_WITH_TAG=${dependentImageWithTag} \
306+
STAGE=${dependent.dockerfileStage} \
307+
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
308+
""", sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200)
301309
args += " DEVEL_IMAGE=${dependentImageWithTag}"
302310
if (target == "ngc-release") {
303311
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
@@ -315,18 +323,18 @@ def buildImage(config, imageKeyToTag)
315323
}
316324
}
317325
stage ("make ${target}_${action} (${arch})") {
318-
retry(3) {
319-
sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
320-
}
321-
retry(3) {
322-
sh """
323-
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
324-
TORCH_INSTALL_TYPE=${torchInstallType} \
325-
IMAGE_WITH_TAG=${imageWithTag} \
326-
STAGE=${dockerfileStage} \
327-
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
328-
"""
329-
}
326+
sh "env | sort"
327+
def randomSleep = (Math.random() * 300 + 300).toInteger()
328+
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200)
329+
trtllm_utils.llmExecStepWithRetry(this, script: """
330+
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
331+
BASE_IMAGE=${BASE_IMAGE} \
332+
TRITON_IMAGE=${TRITON_IMAGE} \
333+
TORCH_INSTALL_TYPE=${torchInstallType} \
334+
IMAGE_WITH_TAG=${imageWithTag} \
335+
STAGE=${dockerfileStage} \
336+
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
337+
""", sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200)
330338
if (target == "ngc-release") {
331339
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
332340
}
@@ -336,6 +344,8 @@ def buildImage(config, imageKeyToTag)
336344
stage ("custom tag: ${customTag} (${arch})") {
337345
sh """
338346
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
347+
BASE_IMAGE=${BASE_IMAGE} \
348+
TRITON_IMAGE=${TRITON_IMAGE} \
339349
TORCH_INSTALL_TYPE=${torchInstallType} \
340350
IMAGE_WITH_TAG=${customImageWithTag} \
341351
STAGE=${dockerfileStage} \

jenkins/L0_Test.groovy

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
3939
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312
4040

4141
// DLFW torch image
42-
DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.06-py3"
42+
DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3"
4343

4444
//Ubuntu base image
4545
UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@@ -2075,6 +2075,11 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
20752075
checkPipStage = true
20762076
}
20772077

2078+
if (cpu_arch == AARCH64_TRIPLE && values[5] != DLFW_IMAGE) {
2079+
checkPipStage = false
2080+
echo "Skip pip install sanity check due to https://nvbugs/5453827"
2081+
}
2082+
20782083
if (checkPipStage) {
20792084
stage("Run LLMAPI tests") {
20802085
pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch)

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,5 +217,6 @@ l0_dgx_h100:
217217
terms:
218218
stage: post_merge
219219
backend: triton
220+
auto_trigger: others
220221
tests:
221222
- triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]

tests/unittest/test_pip_install.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -51,9 +51,6 @@ def test_pip_install():
5151
help="The wheel path")
5252
args = parser.parse_args()
5353

54-
if not os.environ.get("CUDA_HOME"):
55-
os.environ["CUDA_HOME"] = "/usr/local/cuda"
56-
5754
print("########## Install required system libs ##########")
5855
if not os.path.exists("/usr/local/mpi/bin/mpicc"):
5956
subprocess.check_call("apt-get -y install libopenmpi-dev", shell=True)

0 commit comments

Comments
 (0)