Skip to content

Update strategy matrix #7

Update strategy matrix

Update strategy matrix #7

# Reusable workflow (invoked through `workflow_call`) that runs the
# TransformerEngine tests on the EKS cluster.
# The name is quoted because it begins with `~`, which is the YAML null
# indicator when it stands alone.
name: '~test TransformerEngine tests on EKS cluster (8xH100)'

on:
  workflow_call:
    # Every input is a mandatory string supplied by the calling workflow.
    inputs:
      JAX_DOCKER_IMAGE:
        description: "URI of image to run tests on"
        required: true
        type: string
      JOB_NAME:
        description: "Job name identifying the unique GitHub Actions run"
        required: true
        type: string
      S3_BUCKET:
        description: "AWS S3 bucket to which logs will be uploaded for processing in CI"
        required: true
        type: string
      CI_NAME:
        description: "Name of the CI"
        required: true
        type: string

permissions:
  contents: read  # to fetch code
  actions: write  # to cancel previous workflows
  packages: write  # to upload container
jobs:
  transformer-engine-test-eks:
    env:
      # Directory holding the job template (test.yml) and per-test scripts.
      TE_EKS_FILES_PREFIX: .github/eks-workflow-files/transformer-engine
      # Unique per matrix entry: <job name>-<GPU count>-<test kind>.
      RUN_NAME: ${{ inputs.JOB_NAME }}-${{ matrix.N_GPU }}-${{ matrix.TEST }}
    strategy:
      fail-fast: true
      # GitHub Actions rejected the previous matrix ("Invalid workflow file:
      # You have an error in your yaml syntax"): `include` contained a plain
      # mapping key (`TEST: [multigpu]`) followed by a sequence item, which is
      # not valid YAML. `include` must be a list of objects whose values are
      # scalars, and the base axes must sit directly under `matrix`.
      #
      # Resulting jobs: multigpu on 2, 4 and 8 GPUs, plus unittest on 8 GPUs.
      # The include entry would overwrite TEST on the base combinations, so
      # it is added as a separate, fourth combination.
      matrix:
        N_GPU: [2, 4, 8]
        TEST: [multigpu]
        include:
          - N_GPU: 8
            TEST: unittest
    runs-on: [eks]  # H100
    steps:
# The later steps reference local actions (./.github/actions/...) and files
# under .github/, so the repository has to be present on the runner first.
- uses: actions/checkout@v4
  name: Check out the repository
# Authenticate against ghcr.io with the workflow's own token.
- name: Login to GitHub Container Registry
  uses: docker/login-action@v3
  with:
    registry: 'ghcr.io'
    password: "${{ secrets.GITHUB_TOKEN }}"
    username: "${{ github.repository_owner }}"
# Local composite action; its `token-name` output is substituted into the
# job manifest (SECRETS_NAME placeholder) by the "Configure job manifest" step.
- id: store-token
  name: K8s GHCR store and delete token
  uses: ./.github/actions/store-delete-k8s-ghcr
# Render the Kubernetes job manifest for this matrix entry:
#   1. splice the test script into the template in place of the line that
#      contains the SCRIPT placeholder;
#   2. substitute the JOB_NAME / IMAGE_URI / SECRETS_NAME / N_GPU placeholders.
- name: Configure job manifest
  run: |
    K8S_JOB_TEMPLATE="${{ env.TE_EKS_FILES_PREFIX }}/test.yml"
    K8S_JOB_MANIFEST="${{ env.TE_EKS_FILES_PREFIX }}/${{ matrix.TEST }}.yml"
    SCRIPT="${{ env.TE_EKS_FILES_PREFIX }}/scripts/${{ matrix.TEST }}.sh"

    # `r` appends the script file after the matching line, `d` then drops
    # the placeholder line itself.
    sed '/SCRIPT/ {
      r '"$SCRIPT"'
      d
    }' "$K8S_JOB_TEMPLATE" > "$K8S_JOB_MANIFEST"

    # Substitute in place. The previous `cat FILE | sed ... | tee FILE`
    # pipeline read and truncated the same file concurrently, which can
    # leave the manifest empty or partially written.
    # NOTE(review): JOB_NAME is filled with inputs.JOB_NAME while the submit
    # step waits on env.RUN_NAME; confirm against test.yml which is intended.
    sed -i \
      -e "s@JOB_NAME@${{ inputs.JOB_NAME }}@g" \
      -e "s@IMAGE_URI@${{ inputs.JAX_DOCKER_IMAGE }}@g" \
      -e "s@SECRETS_NAME@${{ steps.store-token.outputs.token-name }}@g" \
      -e "s@N_GPU@${{ matrix.N_GPU }}@g" \
      "$K8S_JOB_MANIFEST"

    # Echo the rendered manifest into the job log (previously done by tee).
    cat "$K8S_JOB_MANIFEST"
# Hand the rendered manifest (same path the "Configure job manifest" step
# writes) to the local submit/delete action, together with the per-matrix
# run name.
- name: Submit & delete transformer engine unit test job
  uses: ./.github/actions/submit-delete-k8s-job
  with:
    job-name: "${{ env.RUN_NAME }}"
    job-config-file: "${{ env.TE_EKS_FILES_PREFIX }}/${{ matrix.TEST }}.yml"
# Pull the *.log files for this run from S3 and count PASSED / FAILED test
# lines in tests.log. The counts are published as step outputs
# (PASSED_TESTS, FAILED_TESTS, TOTAL_TESTS) for the sitrep step.
- name: Download and process logs from S3
  id: s3-logs-process
  run: |
    LOCAL_DIR="${{ inputs.CI_NAME }}-output"
    mkdir -p "$LOCAL_DIR"

    # RUN_NAME is a job-level env value, not a workflow input: the previous
    # `inputs.RUN_NAME` expanded to an empty string and produced the wrong
    # S3 prefix.
    # aws s3 cp s3://${{ inputs.S3_BUCKET }}/${{ inputs.CI_NAME }}/${{ env.RUN_NAME }}/summary.txt $LOCAL_DIR/
    aws s3 cp "s3://${{ inputs.S3_BUCKET }}/${{ inputs.CI_NAME }}/${{ env.RUN_NAME }}/" "$LOCAL_DIR/" --recursive --exclude "*" --include "*.log"

    # grep -c exits non-zero when nothing matches (or the file is missing);
    # `|| true` keeps the step alive and the default below yields 0.
    passed=$(grep -c 'PASSED opt/transformer' "$LOCAL_DIR/tests.log" || true)
    failed=$(grep -c 'FAILED opt/transformer' "$LOCAL_DIR/tests.log" || true)
    passed=${passed:-0}
    failed=${failed:-0}
    total=$((failed + passed))

    echo "Passed tests: $passed"
    echo "Failed tests: $failed"
    echo "Total tests: $total"
    {
      echo "PASSED_TESTS=$passed"
      echo "FAILED_TESTS=$failed"
      echo "TOTAL_TESTS=$total"
    } >> "$GITHUB_OUTPUT"
# Build sitrep.json and the badge JSON from the counts published by the
# s3-logs-process step. Runs unless the workflow was cancelled.
- name: Generate sitrep
  id: sitrep
  if: ${{ !cancelled() }}
  shell: bash -x -e {0}
  run: |
    # bring in utility functions
    source .github/workflows/scripts/to_json.sh

    badge_label='TransformerEngine EKS Unit'
    total_tests=${{ steps.s3-logs-process.outputs.TOTAL_TESTS }}
    failed_tests=${{ steps.s3-logs-process.outputs.FAILED_TESTS }}
    passed_tests=${{ steps.s3-logs-process.outputs.PASSED_TESTS }}
    errors="0"
    summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
    badge_message="Passed $passed_tests out of $total_tests."
    badge_color="brightgreen"

    # Default to 0 so the numeric test stays valid when the upstream step
    # produced no output (e.g. it failed before writing the counts).
    if [ "${failed_tests:-0}" -gt 0 ]; then
      badge_color="red"
    fi

    # `fi` must end its own command: the previous `fi \` glued it to
    # `to_json ...`, which bash rejects as a syntax error.
    to_json \
      summary \
      errors total_tests passed_tests failed_tests \
      badge_label badge_color badge_message \
      > sitrep.json

    # These assignments are scoped to the to_json call only.
    schemaVersion=1 \
    label="${badge_label}" \
    message="Passed $passed_tests out of $total_tests." \
    color=$badge_color \
    to_json schemaVersion label message color \
      > badge-transformer-engine-test.json
# Publish the sitrep, the badge and the downloaded logs. Runs unless the
# workflow was cancelled, so failing runs still upload their logs.
- name: Upload artifacts
  if: ${{ !cancelled() }}
  uses: actions/upload-artifact@v4
  with:
    name: "artifact-multigpu-test-${{ env.RUN_NAME }}"
    # The log directory must match LOCAL_DIR in the s3-logs-process step
    # (<CI_NAME>-output); the previous hard-coded "trasformer-engine-output"
    # was misspelled and did not follow CI_NAME.
    path: |
      sitrep.json
      badge-transformer-engine-test.json
      ${{ inputs.CI_NAME }}-output/*