Revert unit test change #80
name: ~CI, single-arch
run-name: CI-${{ inputs.ARCHITECTURE }}
on:
  workflow_call:
    inputs:
      ARCHITECTURE:
        type: string
        required: true
      BUILD_DATE:
        type: string
        description: 'Build date in YYYY-MM-DD format'
        required: false
        default: NOT SPECIFIED
      CUDA_IMAGE:
        type: string
        description: CUDA image to use as base, e.g. nvidia/cuda:X.Y.Z-devel-ubuntu22.04
        default: 'latest'
        required: false
      MANIFEST_ARTIFACT_NAME:
        type: string
        description: 'Artifact name in current run w/ manifest/patches. Leaving empty uses manifest/patches in current branch'
        default: ''
        required: false
      SOURCE_URLREFS:
        type: string
        description: 'A JSON object containing git URLs and refs for the software to be built'
        required: false
        default: '{}'
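      # Illustrative shape only: the keys mirror the fromJson() lookups used in
      # the build jobs below; the URL#ref encoding is an assumption rather than
      # a documented format:
      #   {"JAX": "https://github.com/jax-ml/jax.git#main",
      #    "XLA": "https://github.com/openxla/xla.git#main"}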
    outputs:
      DOCKER_TAGS:
        description: 'JSON object containing tags of all docker images built'
        value: ${{ jobs.collect-docker-tags.outputs.TAGS }}
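    # A caller can parse DOCKER_TAGS with fromJson; a minimal sketch, assuming
    # the job that invokes this reusable workflow is named "ci" (hypothetical):
    #   - run: echo '${{ needs.ci.outputs.DOCKER_TAGS }}' | jq -r '.[].tag'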
permissions:
  contents: read  # to fetch code
  actions: write  # to cancel previous workflows
  packages: write # to upload container
jobs:
  # build-base:
  #   uses: ./.github/workflows/_build_base.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     BASE_IMAGE: ${{ inputs.CUDA_IMAGE }}
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }}
  #   secrets: inherit
  #
  # build-jax:
  #   needs: build-base
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-jax-build
  #     BADGE_FILENAME: badge-jax-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }}
  #     CONTAINER_NAME: jax
  #     DOCKERFILE: .github/container/Dockerfile.jax
  #     RUNNER_SIZE: large
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }}
  #       URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }}
  #       URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }}
  #       URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
  #   secrets: inherit
  # build-triton:
  #   needs: build-jax
  #   if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-triton-build
  #     BADGE_FILENAME: badge-triton-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: triton
  #     DOCKERFILE: .github/container/Dockerfile.triton
  #     RUNNER_SIZE: large
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
  #   secrets: inherit
  #
  # build-equinox:
  #   needs: build-jax
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-equinox-build
  #     BADGE_FILENAME: badge-equinox-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: equinox
  #     DOCKERFILE: .github/container/Dockerfile.equinox
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
  #   secrets: inherit
  #
  # build-maxtext:
  #   needs: build-jax
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-maxtext-build
  #     BADGE_FILENAME: badge-maxtext-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: maxtext
  #     DOCKERFILE: .github/container/Dockerfile.maxtext
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
  #   secrets: inherit
  #
  # build-levanter:
  #   needs: [build-jax]
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: "artifact-levanter-build"
  #     BADGE_FILENAME: "badge-levanter-build"
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: levanter
  #     DOCKERFILE: .github/container/Dockerfile.levanter
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }}
  #       URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }}
  #   secrets: inherit
  #
  # build-upstream-t5x:
  #   needs: build-jax
  #   uses: ./.github/workflows/_build.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: "artifact-t5x-build"
  #     BADGE_FILENAME: "badge-t5x-build"
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: upstream-t5x
  #     DOCKERFILE: .github/container/Dockerfile.t5x
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }}
  #       URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }}
  #   secrets: inherit
  #
  # build-rosetta-t5x:
  #   needs: build-upstream-t5x
  #   uses: ./.github/workflows/_build_rosetta.yaml
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}
  #     BASE_LIBRARY: t5x
  #   secrets: inherit
  #
  # build-gemma:
  #   needs: build-jax
  #   uses: ./.github/workflows/_build.yaml
  #   if: inputs.ARCHITECTURE == 'amd64' # build only amd64
  #   with:
  #     ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
  #     ARTIFACT_NAME: artifact-gemma-build
  #     BADGE_FILENAME: badge-gemma-build
  #     BUILD_DATE: ${{ inputs.BUILD_DATE }}
  #     BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
  #     CONTAINER_NAME: gemma
  #     DOCKERFILE: rosetta/Dockerfile.gemma
  #     DOCKER_CONTEXT: .
  #     EXTRA_BUILD_ARGS: |
  #       URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }}
  #       URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }}
  #       URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }}
  #       URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }}
  #       URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
  #   secrets: inherit
  # collect-docker-tags:
  #   runs-on: ubuntu-22.04
  #   if: "!cancelled()"
  #   needs:
  #     - build-base
  #     - build-jax
  #     # - build-triton
  #     # - build-equinox
  #     # - build-maxtext
  #     # - build-levanter
  #     # - build-upstream-t5x
  #     # - build-rosetta-t5x
  #     # - build-gemma
  #   outputs:
  #     TAGS: ${{ steps.collect-tags.outputs.TAGS }}
  #
  #   steps:
  #     - name: Save docker tags as a JSON object
  #       id: collect-tags
  #       run: |
  #         TAGS=$(cat <<EOF | jq -c .
  #         [\
  #           {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
  #           {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
  #           {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
  #           {}\
  #         ]
  #         EOF
  #         )
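  #         # Note: with the unquoted EOF delimiter, the trailing backslashes act
  #         # as shell line continuations, so jq's identity filter (.) receives
  #         # the array on one line and -c emits compact JSON that is safe to
  #         # append to $GITHUB_OUTPUT below.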
  #
  #         echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT
  # test-distribution:
  #   runs-on: ubuntu-22.04
  #   strategy:
  #     matrix:
  #       TEST_SCRIPT:
  #         - extra-only-distribution.sh
  #         - mirror-only-distribution.sh
  #         - upstream-only-distribution.sh
  #         - local-patch-distribution.sh
  #     fail-fast: false
  #   steps:
  #     - name: Print environment variables
  #       run: env
  #     - name: Set git login for tests
  #       run: |
  #         git config --global user.email "jax@nvidia.com"
  #         git config --global user.name "JAX-Toolbox CI"
  #     - name: Check out the repository under ${GITHUB_WORKSPACE}
  #       uses: actions/checkout@v4
  #     - name: Run integration test ${{ matrix.TEST_SCRIPT }}
  #       run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
  #
  # test-jax:
  #   needs: build-jax
  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  #   uses: ./.github/workflows/_test_unit.yaml
  #   with:
  #     TEST_NAME: jax
  #     EXECUTE: |
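  #       # "EOF" is quoted below, so the heredoc body is not expanded on the
  #       # runner; the script runs verbatim inside the container via docker run -i.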
  #       docker run -i --shm-size=1g --gpus all \
  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
  #         bash <<"EOF" |& tee test-backend-independent.log
  #       test-jax.sh -b backend-independent
  #       EOF
  #       docker run -i --shm-size=1g --gpus all \
  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
  #         bash <<"EOF" |& tee test-gpu.log
  #       nvidia-cuda-mps-control -d
  #       test-jax.sh -b gpu
  #       EOF
  #     STATISTICS_SCRIPT: |
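  #       # grep -c exits non-zero when there are no matches; || true keeps the
  #       # counters at zero instead of failing the script.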
  #       errors=$(cat test-*.log | grep -c 'ERROR:' || true)
  #       failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
  #       passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
  #       total_tests=$((failed_tests + passed_tests))
  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
  #     ARTIFACTS: |
  #       test-backend-independent.log
  #       test-gpu.log
  #   secrets: inherit
  #
  # test-nsys-jax:
  #   needs: build-jax
  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  #   uses: ./.github/workflows/_test_unit.yaml
  #   with:
  #     TEST_NAME: nsys-jax
  #     EXECUTE: |
  #       set -o pipefail
  #       num_tests=0
  #       num_failures=0
  #       # Run the pytest-driven tests; failure is explicitly handled below so set +e to
  #       # avoid an early abort here.
  #       set +e
  #       docker run -i --shm-size=1g --gpus all \
  #         -v $PWD:/opt/output \
  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
  #         bash <<"EOF" |& tee test-nsys-jax.log
  #       # nsys-jax is already installed, this is just adding the test dependencies
  #       pip install pytest-reportlog nsys-jax[test]
  #       # abuse knowledge that nsys-jax is installed editable, so the tests exist
  #       test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
  #       pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}"
  #       EOF
  #       set -e
  #       GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
  #       for mode in 1-process 2-process process-per-gpu; do
  #         DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"
  #         if [[ "${mode}" == "1-process" ]]; then
  #           PROCESS_COUNT=1
  #           ARGS=""
  #         elif [[ "${mode}" == "2-process" ]]; then
  #           # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that
  #           # this will flush out more bugs than process-per-node or process-per-GPU.
  #           PROCESS_COUNT=2
  #           ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed"
  #         else
  #           PROCESS_COUNT=${GPUS_PER_NODE}
  #           ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed"
  #         fi
  #         for collection in full partial; do
  #           NSYS_JAX="nsys-jax"
  #           if [[ "${mode}" == "1-process" ]]; then
  #             # We will not run nsys-jax-combine, so run analyses eagerly
  #             NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary"
  #           fi
  #           NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}"
  #           if [[ "${collection}" == "partial" ]]; then
  #             NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop"
  #             # nvbug/4801401
  #             NSYS_JAX+=" --sample=none"
  #           fi
  #           set +e
  #           ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \
  #             -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log
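  #           # (with pipefail set at the top of this script, $? reflects the
  #           # parallel-launch pipeline rather than tee)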
  #           num_failures=$((num_failures + ($? != 0)))
  #           set -e
  #           num_tests=$((num_tests + 1))
  #         done
  #         if [[ "${mode}" != "1-process" ]]; then
  #           # Run nsys-jax-combine
  #           NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip"
  #           for (( i=0; i<PROCESS_COUNT; i++ )); do
  #             NSYS_JAX_COMBINE+=" /opt/output/${mode}-${collection}-execution-${i}.zip"
  #           done
  #           set +e
  #           ${DOCKER} ${NSYS_JAX_COMBINE} |& tee ${mode}-${collection}-execution-combine.log
  #           num_failures=$((num_failures + ($? != 0)))
  #           set -e
  #           num_tests=$((num_tests + 1))
  #         fi
  #       done
  #       ls -R .
  #       echo "NSYS_JAX_TEST_COUNT=${num_tests}" >> $GITHUB_ENV
  #       echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV
  #       exit $num_failures
  #     STATISTICS_SCRIPT: |
  #       summary_line=$(tail -n1 test-nsys-jax.log)
  #       num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
  #       passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
  #       failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
  #       total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests ))
  #       num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT ))
  #       num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT ))
  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
  #       echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
  #       echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT
  #       echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT
  #     ARTIFACTS: |
  #       # pytest-driven part
  #       test-nsys-jax.log
  #       pytest-report.jsonl
  #       # nsys-jax logfiles
  #       *process-*-execution.log
  #       # nsys-jax output for the case that doesn't use nsys-jax-combine
  #       1-process-*-execution-0.zip
  #       # nsys-jax-combine output/logfiles
  #       *process*-*-execution.zip
  #       *-execution-combine.log
  #   secrets: inherit
  #
  # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
  # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
  # # not already have nsys-jax installed
  # test-nsys-jax-archive:
  #   needs: test-nsys-jax
  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  #   strategy:
  #     matrix:
  #       os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
  #   runs-on: ${{ matrix.os }}
  #   steps:
  #     - name: Download nsys-jax output .zip files
  #       uses: actions/download-artifact@v4
  #       with:
  #         name: nsys-jax-unit-test-A100
  #     - name: Extract archives and execute install scripts
  #       run: |
  #         pip install virtualenv # for install.sh
  #         for zip in *.zip; do
  #           ZIP="${PWD}/${zip}"
  #           pushd $(mktemp -d)
  #           unzip "${ZIP}"
  #           ls -l
  #           # TODO: verify this isn't needed, or make sure it isn't needed
  #           chmod 755 install.sh
  #           # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
  #           # Skip executing Jupyter lab
  #           NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
  #           popd
  #         done
  # test-nsys-jax-eks:
  #   needs: build-jax
  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  #   runs-on: eks
  #   env:
  #     JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
  #     JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax
  #     POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
  #     TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token
  #   steps:
  #     - name: Check out the repository
  #       uses: actions/checkout@v4
  #     - name: Login to GitHub Container Registry
  #       uses: docker/login-action@v3
  #       with:
  #         registry: ghcr.io
  #         username: ${{ github.repository_owner }}
  #         password: ${{ secrets.GITHUB_TOKEN }}
  #     - name: Store GitHub Container Registry token as Kubernetes secret
  #       run: |
  #         kubectl create secret generic \
  #           ${{ github.run_id }}-${{ github.run_attempt }}-token \
  #           --from-file=.dockerconfigjson=$HOME/.docker/config.json \
  #           --type=kubernetes.io/dockerconfigjson
  #     - name: Configure Kubernetes job
  #       run: |
  #         yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
  #           | select(di == 1).metadata.name = strenv(JOB_NAME)
  #           | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
  #           | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
  #           | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
  #           .github/eks-workflow-files/job.yml
  #         git diff .github/eks-workflow-files/job.yml
  #     - name: Submit Kubernetes job
  #       run: kubectl apply -f .github/eks-workflow-files/job.yml
  #     - name: Wait for Kubernetes job to start
  #       run: |
  #         while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
  #           sleep 2
  #         done
  #     - name: Stream Kubernetes job output
  #       run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
  #     # Clean up in case of errors as well as success
  #     - name: Delete Kubernetes job
  #       if: always()
  #       run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
  #     - name: Configure post-processing job
  #       run: |
  #         export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
  #         yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
  #           | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
  #           | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
  #           | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
  #           .github/eks-workflow-files/post-process-job.yml
  #         git diff .github/eks-workflow-files/post-process-job.yml
  #     - name: Submit post-processing Kubernetes job
  #       run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml
  #     - name: Wait for post-processing Kubernetes job to start
  #       run: |
  #         while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
  #           sleep 2
  #         done
  #     - name: Stream post-processing Kubernetes job output
  #       run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
  #     # Clean up in case of errors as well as success
  #     - name: Delete post-processing Kubernetes job
  #       if: always()
  #       run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
  #     - name: Delete GitHub Container Registry token
  #       if: always()
  #       run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
  # test-equinox:
  #   needs: build-equinox
  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  #   uses: ./.github/workflows/_test_unit.yaml
  #   with:
  #     IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}
  #     TEST_NAME: equinox
  #     EXECUTE: |
  #       docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \
  #         bash -exc -o pipefail \
  #         'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log
  #     STATISTICS_SCRIPT: |
  #       summary_line=$(tail -n1 test-equinox.log)
  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
  #       failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
  #       passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
  #       total_tests=$((failed_tests + passed_tests))
  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
  #     ARTIFACTS: |
  #       test-equinox.log
  #   secrets: inherit
  test-te-h100:
    # needs: build-jax
    if: inputs.ARCHITECTURE == 'amd64'
    uses: ./.github/workflows/_transformer_engine_eks.yaml
    with:
      # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
      JAX_DOCKER_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:13652377029-jax-amd64
      JOB_NAME: transformerengine-${{ github.run_id }}
      S3_BUCKET: jax-toolbox-eks-output
      CI_NAME: transformer-engine
    secrets: inherit
  test-te-multigpu-a100:
    # needs: build-jax
    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
    uses: ./.github/workflows/_test_te.yaml
    with:
      # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
      TE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:13652377029-jax-amd64
    secrets: inherit
  test-te-unit-a100:
    # needs: build-jax
    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
    uses: ./.github/workflows/_test_unit.yaml
    with:
      TEST_NAME: te
      EXECUTE: |
        docker run -i --gpus all --shm-size=1g -v $PWD:/log \
          ghcr.io/nvidia/jax-toolbox-internal:13652377029-jax-amd64 \
          bash <<"EOF" |& tee test-te.log
        pip install pytest-reportlog pytest-xdist
        # Start MPS daemon
        nvidia-cuda-mps-control -d
        # TE's default is slightly different, without the hyphen
        export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
        # 1 GPU per worker, 6 workers per GPU
        pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
        # 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation
        # into a single .jsonl file of results from multiple pytest invocations
        # inside the test.sh script, so it's useful even with a single worker per
        # device.
        pytest-xdist.sh 8 1 pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh
        pytest-xdist.sh 8 1 pytest-report-L1-distributed-unittest.jsonl bash ${TE_PATH}/qa/L1_jax_distributed_unittest/test.sh
        # merge the log files
        cat \
          pytest-report-L0-unittest.jsonl \
          pytest-report-L0-distributed-unittest.jsonl \
          pytest-report-L1-distributed-unittest.jsonl \
          > pytest-report.jsonl
        EOF
      STATISTICS_SCRIPT: |
        summary_line=$(tail -n1 test-te.log)
        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
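        # pytest-reportlog writes one TestReport record per phase (setup, call,
        # teardown); filtering on .when == "call" counts each test exactly once.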
        passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
        failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
        total_tests=$((failed_tests + passed_tests))
        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
      TIMEOUT_MINUTES: 120
      ARTIFACTS: |
        test-te.log
        pytest-report.jsonl
    secrets: inherit
  # te-unittests:
  #   secrets: inherit
  #   needs: build-jax
  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  #   uses: ./.github/workflows/_test_unit.yaml
  #   with:
  #     TEST_NAME: te
  #     EXECUTE: |
  #       docker run -i --gpus all --shm-size=1g -v $PWD:/log \
  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
  #         bash <<"EOF" |& tee test-te.log
  #       pip install pytest-reportlog pytest-xdist
  #       # Start MPS daemon
  #       nvidia-cuda-mps-control -d
  #       # TE's default is slightly different, without the hyphen
  #       export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
  #       # 1 GPU per worker, 6 workers per GPU
  #       pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
  #       EOF
  #
  #     STATISTICS_SCRIPT: |
  #       summary_line=$(tail -n1 test-te.log)
  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
  #       passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
  #       failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
  #       total_tests=$((failed_tests + passed_tests))
  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
  #
  #     TIMEOUT_MINUTES: 120
  #     ARTIFACTS: |
  #       test-te.log
  #       pytest-report.jsonl
  #
  # test-upstream-t5x:
  #   needs: build-upstream-t5x
  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  #   uses: ./.github/workflows/_test_upstream_t5x.yaml
  #   with:
  #     T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}
  #   secrets: inherit
  #
  # test-rosetta-t5x:
  #   needs: build-rosetta-t5x
  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
  #   uses: ./.github/workflows/_test_t5x_rosetta.yaml
  #   with:
  #     T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
  #   secrets: inherit
  #
  # test-triton:
  #   needs: build-triton
  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
  #   uses: ./.github/workflows/_test_unit.yaml
  #   with:
  #     TEST_NAME: triton
  #     EXECUTE: |
  #       docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
  #         ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \
  #         bash <<"EOF" |& tee test-triton.log
  #       # autotuner tests from jax-triton now hit a triton code path that uses
  #       # utilities from pytorch; this relies on actually having a CUDA backend
  #       # for pytorch
  #       pip install --no-deps torch
  #       python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml
  #       EOF
  #     STATISTICS_SCRIPT: |
  #       curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq;
  #       total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml)
  #       errors=$(./yq '.testsuites."+@errors"' triton_test.xml)
  #       failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml)
  #       passed_tests=$((total_tests - errors - failed_tests))
  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
  #     ARTIFACTS: |
  #       test-triton.log
  #   secrets: inherit
  #
  # test-levanter:
  #   needs: build-levanter
  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  #   uses: ./.github/workflows/_test_unit.yaml
  #   with:
  #     TEST_NAME: levanter
  #     EXECUTE: |
  #       docker run -i --gpus all --shm-size=1g \
  #         ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
  #         bash <<"EOF" |& tee test-levanter.log
  #       pip install flake8 pytest soundfile librosa
  #       PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray"
  #       EOF
  #     STATISTICS_SCRIPT: |
  #       summary_line=$(tail -n1 test-levanter.log)
  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
  #       failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
  #       passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
  #       total_tests=$((failed_tests + passed_tests))
  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
  #     ARTIFACTS: |
  #       test-levanter.log
  #   secrets: inherit
  #
  # # test-te:
  # #   needs: build-upstream-pax
  # #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
  # #   uses: ./.github/workflows/_test_unit.yaml
  # #   with:
  # #     TEST_NAME: te
  # #     EXECUTE: |
  # #       docker run -i --gpus all --shm-size=1g -v $PWD:/log \
  # #         ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \
  # #         bash <<"EOF" |& tee test-te.log
  # #       pip install pytest-reportlog
  # #       pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax
  # #       EOF
  # #     STATISTICS_SCRIPT: |
  # #       summary_line=$(tail -n1 test-te.log)
  # #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
  # #       passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
  # #       failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
  # #       total_tests=$((failed_tests + passed_tests))
  # #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
  # #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
  # #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
  # #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
  # #     TIMEOUT_MINUTES: 120
  # #     ARTIFACTS: |
  # #       test-te.log
  # #       pytest-report.jsonl
  # #   secrets: inherit
  #
  # test-gemma:
  #   needs: build-gemma
  #   uses: ./.github/workflows/_test_unit.yaml
  #   if: inputs.ARCHITECTURE == 'amd64'
  #   with:
  #     TEST_NAME: gemma
  #     EXECUTE: |
  #       docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \
  #         bash -ec \
  #         "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log
  #     STATISTICS_SCRIPT: |
  #       summary_line=$(tail -n1 test-gemma.log)
  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
  #       failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
  #       passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
  #       total_tests=$((failed_tests + passed_tests))
  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
  #     ARTIFACTS: |
  #       test-gemma.log
  #   secrets: inherit
  #
  # test-maxtext:
  #   needs: build-maxtext
  #   if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners
  #   uses: ./.github/workflows/_test_maxtext.yaml
  #   with:
  #     MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
  #   secrets: inherit