# .github/workflows/_ci.yaml
# PR #74: Update collect docker tags step for reduced workflow

name: ~CI, single-arch
run-name: CI-${{ inputs.ARCHITECTURE }}

# Reusable workflow: builds the container stack for one architecture and
# publishes the resulting docker tags as a JSON object.
on:
  workflow_call:
    inputs:
      ARCHITECTURE:
        type: string
        required: true
      BUILD_DATE:
        type: string
        description: 'Build date in YYYY-MM-DD format'
        required: false
        default: 'NOT SPECIFIED'
      CUDA_IMAGE:
        type: string
        # Quoted: the value contains ':' and a comma, which can confuse plain scalars.
        description: 'CUDA image to use as base, e.g. nvidia/cuda:X.Y.Z-devel-ubuntu22.04'
        default: 'latest'
        required: false
      MANIFEST_ARTIFACT_NAME:
        type: string
        description: 'Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch'
        default: ''
        required: false
      SOURCE_URLREFS:
        type: string
        description: 'A JSON object containing git url+refs for softwares to be built'
        required: false
        default: '{}'
    outputs:
      DOCKER_TAGS:
        description: 'JSON object containing tags of all docker images built'
        value: ${{ jobs.collect-docker-tags.outputs.TAGS }}

permissions:
  contents: read # to fetch code
  actions: write # to cancel previous workflows
  packages: write # to upload container
jobs:
  # Build the CUDA base image first; everything else layers on top of it.
  build-base:
    uses: ./.github/workflows/_build_base.yaml
    with:
      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
      BASE_IMAGE: ${{ inputs.CUDA_IMAGE }}
      BUILD_DATE: ${{ inputs.BUILD_DATE }}
      MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }}
    secrets: inherit

  build-jax:
    needs: build-base
    uses: ./.github/workflows/_build.yaml
    with:
      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
      ARTIFACT_NAME: artifact-jax-build
      BADGE_FILENAME: badge-jax-build
      BUILD_DATE: ${{ inputs.BUILD_DATE }}
      BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }}
      CONTAINER_NAME: jax
      DOCKERFILE: .github/container/Dockerfile.jax
      RUNNER_SIZE: large
      EXTRA_BUILD_ARGS: |
        URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }}
        URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }}
        URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }}
        URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
    secrets: inherit

  # The jobs below are disabled for the reduced workflow. When re-enabling one,
  # also restore its entry in collect-docker-tags `needs` and in the TAGS JSON.
  # build-triton:
  #   needs: build-jax
  #   if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-triton-build
  #     BADGE_FILENAME: badge-triton-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: triton
  #     DOCKERFILE: .github/container/Dockerfile.triton
  #     RUNNER_SIZE: large
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
  #   secrets: inherit
  #
  # build-equinox:
  #   needs: build-jax
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-equinox-build
  #     BADGE_FILENAME: badge-equinox-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: equinox
  #     DOCKERFILE: .github/container/Dockerfile.equinox
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
  #   secrets: inherit
  #
  # build-maxtext:
  #   needs: build-jax
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-maxtext-build
  #     BADGE_FILENAME: badge-maxtext-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: maxtext
  #     DOCKERFILE: .github/container/Dockerfile.maxtext
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
  #   secrets: inherit
  #
  # build-levanter:
  #   needs: [build-jax]
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: "artifact-levanter-build"
  #     BADGE_FILENAME: "badge-levanter-build"
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: levanter
  #     DOCKERFILE: .github/container/Dockerfile.levanter
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }}
  #       URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }}
  #   secrets: inherit
  #
  # build-upstream-t5x:
  #   needs: build-jax
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: "artifact-t5x-build"
  #     BADGE_FILENAME: "badge-t5x-build"
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: upstream-t5x
  #     DOCKERFILE: .github/container/Dockerfile.t5x
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }}
  #       URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }}
  #   secrets: inherit
  #
  # build-rosetta-t5x:
  #   needs: build-upstream-t5x
  #   uses: ./.github/workflows/_build_rosetta.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}
  #     BASE_LIBRARY: t5x
  #   secrets: inherit
  #
  # build-gemma:
  #   needs: build-jax
  #   uses: ./.github/workflows/_build.yaml
  #   if: inputs.ARCHITECTURE == 'amd64' # build only amd64
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-gemma-build
  #     BADGE_FILENAME: badge-gemma-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: gemma
  #     DOCKERFILE: rosetta/Dockerfile.gemma
  #     DOCKER_CONTEXT: .
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }}
  #       URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }}
  #       URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }}
  #       URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }}
  #       URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
  #   secrets: inherit
collect-docker-tags:
runs-on: ubuntu-22.04
if: "!cancelled()"
needs:
- build-base
- build-jax
# - build-triton
# - build-equinox
# - build-maxtext
# - build-levanter
# - build-upstream-t5x
# - build-rosetta-t5x
# - build-gemma
outputs:
TAGS: ${{ steps.collect-tags.outputs.TAGS }}
steps:

Check failure on line 194 in .github/workflows/_ci.yaml

View workflow run for this annotation

GitHub Actions / .github/workflows/_ci.yaml

Invalid workflow file

You have an error in your yaml syntax on line 194
- name: Save docker tags as a JSON object
id: collect-tags
run: |
TAGS=$(cat <<EOF | jq -c
[\
{"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
{"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
# {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\
# {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
# {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
# {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
# {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
# {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
# {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
{"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
# {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\
# {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
# {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
# {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
# {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
# {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
# {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
{}\
]
EOF
)
echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT
# test-distribution:
# runs-on: ubuntu-22.04
# strategy:
# matrix:
# TEST_SCRIPT:
# - extra-only-distribution.sh
# - mirror-only-distribution.sh
# - upstream-only-distribution.sh
# - local-patch-distribution.sh
# fail-fast: false
# steps:
# - name: Print environment variables
# run: env
# - name: Set git login for tests
# run: |
# git config --global user.email "jax@nvidia.com"
# git config --global user.name "JAX-Toolbox CI"
# - name: Check out the repository under ${GITHUB_WORKSPACE}
# uses: actions/checkout@v4
# - name: Run integration test ${{ matrix.TEST_SCRIPT }}
# run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
#
# test-jax:
# needs: build-jax
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
# uses: ./.github/workflows/_test_unit.yaml
# with:
# TEST_NAME: jax
# EXECUTE: |
# docker run -i --shm-size=1g --gpus all \
# ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
# bash <<"EOF" |& tee test-backend-independent.log
# test-jax.sh -b backend-independent
# EOF
# docker run -i --shm-size=1g --gpus all \
# ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
# bash <<"EOF" |& tee tee test-gpu.log
# nvidia-cuda-mps-control -d
# test-jax.sh -b gpu
# EOF
# STATISTICS_SCRIPT: |
# errors=$(cat test-*.log | grep -c 'ERROR:' || true)
# failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
# passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
# total_tests=$((failed_tests + passed_tests))
# echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
# echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
# echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
# echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
# ARTIFACTS: |
# test-backend-independent.log
# test-gpu.log
# secrets: inherit
#
# test-nsys-jax:
# needs: build-jax
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
# uses: ./.github/workflows/_test_unit.yaml
# with:
# TEST_NAME: nsys-jax
# EXECUTE: |
# set -o pipefail
# num_tests=0
# num_failures=0
# # Run the pytest-driven tests; failure is explicitly handled below so set +e to
# # avoid an early abort here.
# set +e
# docker run -i --shm-size=1g --gpus all \
# -v $PWD:/opt/output \
# ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
# bash <<"EOF" |& tee test-nsys-jax.log
# # nsys-jax is already installed, this is just adding the test dependencies
# pip install pytest-reportlog nsys-jax[test]
# # abuse knowledge that nsys-jax is installed editable, so the tests exist
# test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
# pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}"
# EOF
# set -e
# GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
# for mode in 1-process 2-process process-per-gpu; do
# DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"
# if [[ "${mode}" == "1-process" ]]; then
# PROCESS_COUNT=1
# ARGS=""
# elif [[ "${mode}" == "2-process" ]]; then
# # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that
# # this will flush out more bugs than process-per-node or process-per-GPU.
# PROCESS_COUNT=2
# ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed"
# else
# PROCESS_COUNT=${GPUS_PER_NODE}
# ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed"
# fi
# for collection in full partial; do
# NSYS_JAX="nsys-jax"
# if [[ "${mode}" == "1-process" ]]; then
# # We will not run nsys-jax-combine, so run analyses eagerly
# NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary"
# fi
# NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}"
# if [[ "${collection}" == "partial" ]]; then
# NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop"
# # nvbug/4801401
# NSYS_JAX+=" --sample=none"
# fi
# set +e
# ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \
# -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log
# num_failures=$((num_failures + ($? != 0)))
# set -e
# num_tests=$((num_tests + 1))
# done
# if [[ "${mode}" != "1-process" ]]; then
# # Run nsys-jax-combine
# NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip"
# for (( i=0; i<PROCESS_COUNT; i++ )); do
# NSYS_JAX_COMBINE+=" /opt/output/${mode}-${collection}-execution-${i}.zip"
# done
# set +e
# ${DOCKER} ${NSYS_JAX_COMBINE} |& tee ${mode}-${collection}-execution-combine.log
# num_failures=$((num_failures + ($? != 0)))
# set -e
# num_tests=$((num_tests + 1))
# fi
# done
# ls -R .
# echo "NSYS_JAX_TEST_COUNT=${num_tests}" >> $GITHUB_ENV
# echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV
# exit $num_failures
# STATISTICS_SCRIPT: |
# summary_line=$(tail -n1 test-nsys-jax.log)
# num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
# passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
# failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
# total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests ))
# num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT ))
# num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT ))
# echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
# echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
# echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT
# echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT
# ARTIFACTS: |
# # pytest-driven part
# test-nsys-jax.log
# pytest-report.jsonl
# # nsys-jax logfiles
# *process-*-execution.log
# # nsys-jax output for the case that doesn't use nsys-jax-combine
# 1-process-*-execution-0.zip
# # nsys-jax-combine output/logfiles
# *process*-*-execution.zip
# *-execution-combine.log
# secrets: inherit
#
# # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
# # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
# # not already have nsys-jax installed
# test-nsys-jax-archive:
# needs: test-nsys-jax
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
# strategy:
# matrix:
# os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
# runs-on: ${{ matrix.os }}
# steps:
# - name: Download nsys-jax output .zip files
# uses: actions/download-artifact@v4
# with:
# name: nsys-jax-unit-test-A100
# - name: Extract archives and execute install scripts
# run: |
# pip install virtualenv # for install.sh
# for zip in $(ls *.zip); do
# ZIP="${PWD}/${zip}"
# pushd $(mktemp -d)
# unzip "${ZIP}"
# ls -l
# # TODO: verify this isn't needed, or make sure it isn't needed
# chmod 755 install.sh
# # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
# # Skip executing Jupyter lab
# NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
# popd
# done
# test-nsys-jax-eks:
# needs: build-jax
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
# runs-on: eks
# env:
# JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
# JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax
# POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
# TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token
# steps:
# - name: Check out the repository
# uses: actions/checkout@v4
# - name: Login to GitHub Container Registry
# uses: docker/login-action@v3
# with:
# registry: ghcr.io
# username: ${{ github.repository_owner }}
# password: ${{ secrets.GITHUB_TOKEN }}
# - name: Store GitHub Container Registry token as Kubernetes secret
# run: |
# kubectl create secret generic \
# ${{ github.run_id }}-${{ github.run_attempt }}-token \
# --from-file=.dockerconfigjson=$HOME/.docker/config.json \
# --type=kubernetes.io/dockerconfigjson
# - name: Configure Kubernetes job
# run: |
# yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
# | select(di == 1).metadata.name = strenv(JOB_NAME)
# | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
# | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
# | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
# .github/eks-workflow-files/job.yml
# git diff .github/eks-workflow-files/job.yml
# - name: Submit Kubernetes job
# run: kubectl apply -f .github/eks-workflow-files/job.yml
# - name: Wait for Kubernetes job to start
# run: |
# while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
# sleep 2
# done
# - name: Stream Kubernetes job output
# run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
# # Clean up in case of errors as well as success
# - name: Delete Kubernetes job
# if: always()
# run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
# - name: Configure post-processing job
# run: |
# export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
# yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
# | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
# | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
# | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
# .github/eks-workflow-files/post-process-job.yml
# git diff .github/eks-workflow-files/post-process-job.yml
# - name: Submit post-processing Kubernetes job
# run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml
# - name: Wait for post-processing Kubernetes job to start
# run: |
# while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
# sleep 2
# done
# - name: Stream post-processing Kubernetes job output
# run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
# # Clean up in case of errors as well as success
# - name: Delete post-processing Kubernetes job
# if: always()
# run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
# - name: Delete GitHub Container Registry token
# if: always()
# run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
# test-equinox:
# needs: build-equinox
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
# uses: ./.github/workflows/_test_unit.yaml
# with:
# IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}
# TEST_NAME: equinox
# EXECUTE: |
# docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \
# bash -exc -o pipefail \
# 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log
# STATISTICS_SCRIPT: |
# errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
# failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
# passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
# total_tests=$((failed_tests + passed_tests))
# echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
# echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
# echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
# echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
# ARTIFACTS: |
# test-equinox.log
# secrets: inherit
test-te-multigpu:
needs: build-jax
if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
uses: ./.github/workflows/_test_te.yaml
with:
TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
secrets: inherit
# test-upstream-t5x:
# needs: build-upstream-t5x
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
# uses: ./.github/workflows/_test_upstream_t5x.yaml
# with:
# T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}
# secrets: inherit
#
# test-rosetta-t5x:
# needs: build-rosetta-t5x
# if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
# uses: ./.github/workflows/_test_t5x_rosetta.yaml
# with:
# T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
# secrets: inherit
#
# test-triton:
# needs: build-triton
# if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
# uses: ./.github/workflows/_test_unit.yaml
# with:
# TEST_NAME: triton
# EXECUTE: |
# docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
# ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \
# bash <<"EOF" |& tee test-triton.log
# # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on
# # actually having a CUDA backend for pytoch
# pip install --no-deps torch
# python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml
# EOF
# STATISTICS_SCRIPT: |
# curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq;
# total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml)
# errors=$(./yq '.testsuites."+@errors"' triton_test.xml)
# failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml)
# passed_tests=$((total_tests - errors - failed_tests))
# echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
# echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
# echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
# echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
# ARTIFACTS: |
# test-triton.log
# secrets: inherit
#
# test-levanter:
# needs: build-levanter
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
# uses: ./.github/workflows/_test_unit.yaml
# with:
# TEST_NAME: levanter
# EXECUTE: |
# docker run -i --gpus all --shm-size=1g \
# ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
# bash <<"EOF" |& tee test-levanter.log
# pip install flake8 pytest soundfile librosa
# PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray"
# EOF
# STATISTICS_SCRIPT: |
# summary_line=$(tail -n1 test-levanter.log)
# errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
# failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
# passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
# total_tests=$((failed_tests + passed_tests))
# echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
# echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
# echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
# echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
# ARTIFACTS: |
# test-levanter.log
# secrets: inherit
#
# # test-te:
# # needs: build-upstream-pax
# # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
# # uses: ./.github/workflows/_test_unit.yaml
# # with:
# # TEST_NAME: te
# # EXECUTE: |
# # docker run -i --gpus all --shm-size=1g -v $PWD:/log \
# # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \
# # bash <<"EOF" |& tee test-te.log
# # pip install pytest-reportlog
# # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax
# # EOF
# # STATISTICS_SCRIPT: |
# # summary_line=$(tail -n1 test-te.log)
# # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
# # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
# # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
# # total_tests=$((failed_tests + passed_tests))
# # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
# # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
# # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
# # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
# # TIMEOUT_MINUTES: 120
# # ARTIFACTS: |
# # test-te.log
# # pytest-report.jsonl
# # secrets: inherit
#
# test-gemma:
# needs: build-gemma
# uses: ./.github/workflows/_test_unit.yaml
# if: inputs.ARCHITECTURE == 'amd64'
# with:
# TEST_NAME: gemma
# EXECUTE: |
# docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \
# bash -ec \
# "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log
# STATISTICS_SCRIPT: |
# summary_line=$(tail -n1 test-gemma.log)
# errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
# failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
# passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
# total_tests=$((failed_tests + passed_tests))
# echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
# echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
# echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
# echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
# ARTIFACTS: |
# test-gemma.log
# secrets: inherit
#
# test-maxtext:
# needs: build-maxtext
# if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners
# uses: ./.github/workflows/_test_maxtext.yaml
# with:
# MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
# secrets: inherit