Skip to content

Commit

Permalink
Improve build reliability (#725)
Browse files Browse the repository at this point in the history
Separates the workflows for building the RAPIDS end user images and the cuVS images. The cuVS images do not depend on the RAPIDS end user images, so they can be built in parallel. This also allows for finer-grained retries in case of failures.

Also switches to using `rapids-mamba-retry` for installing conda packages.

Finally, disables building the `cuvs-bench-datasets` images, which are consistently failing (#724), until a better solution than the workaround in #723 is ready. Commit 5adab54 can be reverted to re-enable this.

Authors:
  - Ray Douglass (https://github.com/raydouglass)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)

URL: #725
  • Loading branch information
raydouglass authored Jan 15, 2025
1 parent 4f0424d commit b2701d6
Show file tree
Hide file tree
Showing 10 changed files with 318 additions and 159 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@ on:
RAPIDS_VER:
required: true
type: string
BASE_TAG:
required: true
type: string
NOTEBOOKS_TAG:
required: true
type: string
CUVS_BENCH_TAG:
required: true
type: string
Expand Down Expand Up @@ -88,38 +82,6 @@ jobs:
with:
driver: docker
endpoint: builders
- name: Build base image
uses: docker/build-push-action@v6
with:
context: context
file: Dockerfile
target: base
push: true
pull: true
build-args: |
CUDA_VER=${{ inputs.CUDA_VER }}
LINUX_DISTRO=${{ inputs.LINUX_DISTRO }}
LINUX_DISTRO_VER=${{ inputs.LINUX_DISTRO_VER }}
LINUX_VER=${{ inputs.LINUX_VER }}
PYTHON_VER=${{ inputs.PYTHON_VER }}
RAPIDS_VER=${{ inputs.RAPIDS_VER }}
tags: ${{ inputs.BASE_TAG }}-${{ matrix.ARCH }}
- name: Build notebooks image
uses: docker/build-push-action@v6
with:
context: context
file: Dockerfile
target: notebooks
push: true
pull: true
build-args: |
CUDA_VER=${{ inputs.CUDA_VER }}
LINUX_DISTRO=${{ inputs.LINUX_DISTRO }}
LINUX_DISTRO_VER=${{ inputs.LINUX_DISTRO_VER }}
LINUX_VER=${{ inputs.LINUX_VER }}
PYTHON_VER=${{ inputs.PYTHON_VER }}
RAPIDS_VER=${{ inputs.RAPIDS_VER }}
tags: ${{ inputs.NOTEBOOKS_TAG }}-${{ matrix.ARCH }}
- name: Build cuVS Benchmarks GPU image
uses: docker/build-push-action@v6
with:
Expand All @@ -134,20 +96,20 @@ jobs:
PYTHON_VER=${{ inputs.PYTHON_VER }}
RAPIDS_VER=${{ inputs.RAPIDS_VER }}
tags: ${{ inputs.CUVS_BENCH_TAG }}-${{ matrix.ARCH }}
- name: Build cuVS Benchmarks GPU with datasets image
uses: docker/build-push-action@v6
with:
context: context
file: cuvs-bench/gpu/Dockerfile
target: cuvs-bench-datasets
push: true
pull: true
build-args: |
CUDA_VER=${{ inputs.CUDA_VER }}
LINUX_VER=${{ inputs.LINUX_VER }}
PYTHON_VER=${{ inputs.PYTHON_VER }}
RAPIDS_VER=${{ inputs.RAPIDS_VER }}
tags: ${{ inputs.CUVS_BENCH_DATASETS_TAG }}-${{ matrix.ARCH }}
# - name: Build cuVS Benchmarks GPU with datasets image
# uses: docker/build-push-action@v6
# with:
# context: context
# file: cuvs-bench/gpu/Dockerfile
# target: cuvs-bench-datasets
# push: true
# pull: true
# build-args: |
# CUDA_VER=${{ inputs.CUDA_VER }}
# LINUX_VER=${{ inputs.LINUX_VER }}
# PYTHON_VER=${{ inputs.PYTHON_VER }}
# RAPIDS_VER=${{ inputs.RAPIDS_VER }}
# tags: ${{ inputs.CUVS_BENCH_DATASETS_TAG }}-${{ matrix.ARCH }}
- name: Build cuVS Benchmarks CPU image
if: inputs.BUILD_CUVS_BENCH_CPU_IMAGE
uses: docker/build-push-action@v6
Expand Down
110 changes: 110 additions & 0 deletions .github/workflows/build-rapids-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Reusable workflow: builds and pushes the `base` and `notebooks` targets of the
# repository Dockerfile, once per architecture listed in ARCHES. Every pushed
# tag is suffixed with `-<ARCH>`.
name: Build and push image variant

on:
  workflow_call:
    inputs:
      # JSON-encoded list of architectures; expanded with fromJSON() into the
      # job matrix below.
      ARCHES:
        required: true
        type: string
      CUDA_VER:
        required: true
        type: string
      LINUX_DISTRO:
        required: true
        type: string
      LINUX_DISTRO_VER:
        required: true
        type: string
      LINUX_VER:
        required: true
        type: string
      PYTHON_VER:
        required: true
        type: string
      RAPIDS_VER:
        required: true
        type: string
      # Full image reference (repo:tag) for the `base` target, without the
      # architecture suffix.
      BASE_TAG:
        required: true
        type: string
      # Full image reference (repo:tag) for the `notebooks` target, without the
      # architecture suffix.
      NOTEBOOKS_TAG:
        required: true
        type: string

jobs:
  build:
    strategy:
      matrix:
        # Only ARCH varies; the remaining entries are single-element lists
        # carrying the caller's inputs into the matrix.
        ARCH: ${{ fromJSON(inputs.ARCHES) }}
        CUDA_VER: ["${{ inputs.CUDA_VER }}"]
        LINUX_VER: ["${{ inputs.LINUX_VER }}"]
        PYTHON_VER: ["${{ inputs.PYTHON_VER }}"]
        RAPIDS_VER: ["${{ inputs.RAPIDS_VER }}"]
      fail-fast: false
    runs-on: "linux-${{ matrix.ARCH }}-cpu4"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history, so `git describe --tags` in a later step can see tags.
          fetch-depth: 0
      - name: Install gha-tools
        run: |
          mkdir -p /tmp/gha-tools
          curl -s -L 'https://github.com/rapidsai/gha-tools/releases/latest/download/tools.tar.gz' | tar -xz -C /tmp/gha-tools
          echo "/tmp/gha-tools" >> "${GITHUB_PATH}"
      - name: Clean up condarc for release builds
        run: |
          GIT_DESCRIBE_TAG="$(git describe --tags --abbrev=0)"
          GIT_DESCRIBE_TAG="${GIT_DESCRIBE_TAG:1}" # remove leading 'v'
          # A tag with no lowercase letters left in it is treated as a release.
          if [[ ! $GIT_DESCRIBE_TAG =~ [a-z] ]]; then
            # Single quotes keep the backticks literal. Inside double quotes bash
            # treats them as command substitution and would try to execute
            # `rapidsai`, `rapidsai-nightly` and `dask/label/dev`.
            rapids-logger 'Most recent tag is for release, adding `rapidsai` channel and removing `rapidsai-nightly` and `dask/label/dev` channels.'
            sed -i 's|rapidsai-nightly|rapidsai|;\|dask/label/dev|d' context/condarc
          else
            rapids-logger "Most recent tag is an alpha. Build will use nightly channels."
          fi
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
          password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
      # Create a Docker context named `builders`; Buildx is pointed at it below.
      - name: Set up Docker Context for Buildx
        id: buildx-context
        run: |
          docker context create builders
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          driver: docker
          endpoint: builders
      - name: Build base image
        uses: docker/build-push-action@v6
        with:
          context: context
          file: Dockerfile
          target: base
          push: true
          pull: true
          build-args: |
            CUDA_VER=${{ inputs.CUDA_VER }}
            LINUX_DISTRO=${{ inputs.LINUX_DISTRO }}
            LINUX_DISTRO_VER=${{ inputs.LINUX_DISTRO_VER }}
            LINUX_VER=${{ inputs.LINUX_VER }}
            PYTHON_VER=${{ inputs.PYTHON_VER }}
            RAPIDS_VER=${{ inputs.RAPIDS_VER }}
          tags: ${{ inputs.BASE_TAG }}-${{ matrix.ARCH }}
      # Same context, Dockerfile and build args as the base image; only the
      # target and the tag differ.
      - name: Build notebooks image
        uses: docker/build-push-action@v6
        with:
          context: context
          file: Dockerfile
          target: notebooks
          push: true
          pull: true
          build-args: |
            CUDA_VER=${{ inputs.CUDA_VER }}
            LINUX_DISTRO=${{ inputs.LINUX_DISTRO }}
            LINUX_DISTRO_VER=${{ inputs.LINUX_DISTRO_VER }}
            LINUX_VER=${{ inputs.LINUX_VER }}
            PYTHON_VER=${{ inputs.PYTHON_VER }}
            RAPIDS_VER=${{ inputs.RAPIDS_VER }}
          tags: ${{ inputs.NOTEBOOKS_TAG }}-${{ matrix.ARCH }}
64 changes: 54 additions & 10 deletions .github/workflows/build-test-publish-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ jobs:
needs:
- checks
- compute-matrix
- build
- build-multiarch-manifest
- build-rapids
- build-rapids-multiarch-manifest
- build-cuvs
- build-cuvs-multiarch-manifest
- test
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
Expand Down Expand Up @@ -137,13 +139,13 @@ jobs:
export TEST_MATRIX
echo "TEST_MATRIX=$(yq -n -o json 'env(TEST_MATRIX)' | jq -c '{include: .}')" | tee --append "${GITHUB_OUTPUT}"
build:
build-rapids:
needs: [checks, compute-matrix]
strategy:
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
fail-fast: false
secrets: inherit
uses: ./.github/workflows/build-image.yml
uses: ./.github/workflows/build-rapids-image.yml
with:
ARCHES: ${{ toJSON(matrix.ARCHES) }}
CUDA_VER: ${{ matrix.CUDA_VER }}
Expand All @@ -152,7 +154,6 @@ jobs:
LINUX_VER: ${{ matrix.LINUX_VER }}
PYTHON_VER: ${{ matrix.PYTHON_VER }}
RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }}
BUILD_CUVS_BENCH_CPU_IMAGE: ${{ matrix.BUILD_CUVS_BENCH_CPU_IMAGE }}
BASE_TAG:
"rapidsai/${{ needs.compute-matrix.outputs.BASE_IMAGE_REPO }}:\
${{ needs.compute-matrix.outputs.BASE_TAG_PREFIX }}\
Expand All @@ -167,6 +168,22 @@ jobs:
${{ needs.compute-matrix.outputs.ALPHA_TAG }}-\
cuda${{ matrix.CUDA_TAG }}-\
py${{ matrix.PYTHON_VER }}"
build-cuvs:
needs: [checks, compute-matrix]
strategy:
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
fail-fast: false
secrets: inherit
uses: ./.github/workflows/build-cuvs-image.yml
with:
ARCHES: ${{ toJSON(matrix.ARCHES) }}
CUDA_VER: ${{ matrix.CUDA_VER }}
LINUX_DISTRO: ${{ matrix.LINUX_DISTRO }}
LINUX_DISTRO_VER: ${{ matrix.LINUX_DISTRO_VER }}
LINUX_VER: ${{ matrix.LINUX_VER }}
PYTHON_VER: ${{ matrix.PYTHON_VER }}
RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }}
BUILD_CUVS_BENCH_CPU_IMAGE: ${{ matrix.BUILD_CUVS_BENCH_CPU_IMAGE }}
CUVS_BENCH_TAG:
"rapidsai/${{ needs.compute-matrix.outputs.CUVS_BENCH_IMAGE_REPO }}:\
${{ needs.compute-matrix.outputs.CUVS_BENCH_TAG_PREFIX }}\
Expand All @@ -187,8 +204,8 @@ jobs:
${{ needs.compute-matrix.outputs.RAPIDS_VER }}\
${{ needs.compute-matrix.outputs.ALPHA_TAG }}-\
py${{ matrix.PYTHON_VER }}"
build-multiarch-manifest:
needs: [build, compute-matrix]
build-rapids-multiarch-manifest:
needs: [build-rapids, compute-matrix]
strategy:
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
fail-fast: false
Expand All @@ -206,7 +223,6 @@ jobs:
- name: Create multiarch manifest
shell: bash
env:
CUVS_BENCH_CPU_IMAGE_BUILT: ${{ matrix.BUILD_CUVS_BENCH_CPU_IMAGE }}
BASE_IMAGE_REPO: ${{ needs.compute-matrix.outputs.BASE_IMAGE_REPO }}
BASE_TAG_PREFIX: ${{ needs.compute-matrix.outputs.BASE_TAG_PREFIX }}
RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }}
Expand All @@ -215,6 +231,34 @@ jobs:
PYTHON_VER: ${{ matrix.PYTHON_VER }}
NOTEBOOKS_IMAGE_REPO: ${{ needs.compute-matrix.outputs.NOTEBOOKS_IMAGE_REPO }}
NOTEBOOKS_TAG_PREFIX: ${{ needs.compute-matrix.outputs.NOTEBOOKS_TAG_PREFIX }}
GPUCIBOT_DOCKERHUB_USER: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
GPUCIBOT_DOCKERHUB_TOKEN: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
ARCHES: ${{ toJSON(matrix.ARCHES) }}
run: ci/create-rapids-multiarch-manifest.sh
build-cuvs-multiarch-manifest:
needs: [build-cuvs, compute-matrix]
strategy:
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
fail-fast: false
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
- name: Create multiarch manifest
shell: bash
env:
RAPIDS_VER: ${{ needs.compute-matrix.outputs.RAPIDS_VER }}
ALPHA_TAG: ${{ needs.compute-matrix.outputs.ALPHA_TAG }}
CUDA_TAG: ${{ matrix.CUDA_TAG }}
PYTHON_VER: ${{ matrix.PYTHON_VER }}
CUVS_BENCH_CPU_IMAGE_BUILT: ${{ matrix.BUILD_CUVS_BENCH_CPU_IMAGE }}
CUVS_BENCH_IMAGE_REPO: ${{ needs.compute-matrix.outputs.CUVS_BENCH_IMAGE_REPO }}
CUVS_BENCH_TAG_PREFIX: ${{ needs.compute-matrix.outputs.CUVS_BENCH_TAG_PREFIX }}
CUVS_BENCH_DATASETS_IMAGE_REPO: ${{ needs.compute-matrix.outputs.CUVS_BENCH_DATASETS_IMAGE_REPO }}
Expand All @@ -224,9 +268,9 @@ jobs:
GPUCIBOT_DOCKERHUB_USER: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
GPUCIBOT_DOCKERHUB_TOKEN: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
ARCHES: ${{ toJSON(matrix.ARCHES) }}
run: ci/create-multiarch-manifest.sh
run: ci/create-cuvs-multiarch-manifest.sh
test:
needs: [compute-matrix, build]
needs: [compute-matrix, build-rapids]
if: inputs.run_tests
strategy:
matrix: ${{ fromJSON(needs.compute-matrix.outputs.TEST_MATRIX) }}
Expand Down
13 changes: 10 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ ARG RAPIDS_VER

SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

# Download the rapidsai gha-tools archive and unpack it into /usr/local/bin.
# wget is installed only for this download and is purged again, together with
# the apt package lists, within the same RUN step.
# NOTE(review): presumably this is what provides the `rapids-mamba-retry`
# command used by the later install steps — confirm.
RUN <<EOF
apt-get update
apt-get install -y wget
wget https://github.com/rapidsai/gha-tools/releases/latest/download/tools.tar.gz -O - | tar -xz -C /usr/local/bin
apt-get purge -y --auto-remove wget
rm -rf /var/lib/apt/lists/*
EOF
RUN useradd -rm -d /home/rapids -s /bin/bash -g conda -u 1001 rapids

USER rapids
Expand All @@ -57,7 +64,7 @@ conda config --show-sources
conda list --show-channel-urls

# Install RAPIDS
mamba install -y -n base \
rapids-mamba-retry install -y -n base \
"rapids=${RAPIDS_VER}.*" \
"python=${PYTHON_VER}.*" \
"cuda-version=${CUDA_VER%.*}.*" \
Expand Down Expand Up @@ -90,12 +97,12 @@ COPY --from=dependencies --chown=rapids /test_notebooks_dependencies.yaml test_n
COPY --from=dependencies --chown=rapids /notebooks /home/rapids/notebooks

RUN <<EOF
mamba env update -n base -f test_notebooks_dependencies.yaml
rapids-mamba-retry env update -n base -f test_notebooks_dependencies.yaml
conda clean -afy
EOF

RUN <<EOF
mamba install -y -n base \
rapids-mamba-retry install -y -n base \
"jupyterlab=4" \
dask-labextension \
jupyterlab-nvdashboard
Expand Down
29 changes: 29 additions & 0 deletions ci/common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Common CI setup: logs in to Docker Hub and exports the resulting API token
# as HUB_TOKEN.
#
# Requires GPUCIBOT_DOCKERHUB_USER and GPUCIBOT_DOCKERHUB_TOKEN in the
# environment, and curl and jq on PATH.

set -eEuo pipefail

# Authenticate and retrieve DockerHub token.
#  - The request body is produced by jq so that quotes or backslashes in the
#    credentials are JSON-escaped instead of corrupting the payload.
#  - `curl --fail` turns an HTTP error response into a non-zero exit status and
#    `jq -e` does the same for a missing/null `.token`. With `pipefail` either
#    one aborts the script here, rather than continuing with HUB_TOKEN set to
#    the literal string "null".
HUB_TOKEN=$(
  jq -cn \
    --arg username "$GPUCIBOT_DOCKERHUB_USER" \
    --arg password "$GPUCIBOT_DOCKERHUB_TOKEN" \
    '{username: $username, password: $password}' |
    curl -sS --fail -H "Content-Type: application/json" \
      -X POST \
      --data @- \
      https://hub.docker.com/v2/users/login/ |
    jq -er .token
)
# Ask the GitHub Actions runner to redact the token from subsequent log output.
echo "::add-mask::${HUB_TOKEN}"
export HUB_TOKEN

# Verify that an image tag is present on Docker Hub; abort the script if not.
#
# Arguments:
#   $1 - repository name inside the "${org}" organization
#   $2 - tag to look up
# Reads HUB_TOKEN (sent as a JWT Authorization header) and org from the
# environment. Returns normally unless the tags endpoint answers with a status
# other than 200, in which case an error is printed and the shell exits with
# status 1.
check_tag_exists() {
  local repo="$1" tag="$2"
  local tag_url="https://hub.docker.com/v2/repositories/${org}/${repo}/tags/${tag}/"
  local http_status

  # Assigned separately from `local` so the assignment carries curl's exit status.
  # Only the HTTP status code is kept; the response body is discarded.
  http_status=$(curl -s -o /dev/null -w "%{http_code}" -H "Authorization: JWT $HUB_TOKEN" "$tag_url")

  # Guard clause: nothing left to do unless the status differs from 200.
  [ "$http_status" -ne 200 ] || return 0

  echo "Error: Required image tag ${repo}:${tag} does not exist. This implies that the image was not built successfully in the build job."
  exit 1
}

# Docker Hub organization that check_tag_exists inserts into the repository URL.
export org="rapidsai"
Loading

0 comments on commit b2701d6

Please sign in to comment.