# Update strategy matrix #6
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Reusable workflow (workflow_call) for the TransformerEngine tests on the
# EKS runners. All four inputs are required strings supplied by the caller.
name: ~test TransformerEngine tests on EKS cluster (8xH100)

on:
  workflow_call:
    inputs:
      JAX_DOCKER_IMAGE:
        type: string
        description: 'URI of image to run tests on'
        required: true
      JOB_NAME:
        type: string
        description: 'Job name identifying the unique GitHub Actions run'
        required: true
      S3_BUCKET:
        type: string
        description: 'AWS S3 bucket to which logs will be uploaded for processing in CI'
        required: true
      CI_NAME:
        type: string
        description: 'Name of the CI'
        required: true

permissions:
  contents: read  # to fetch code
  actions: write  # to cancel previous workflows
  packages: write  # to upload container
jobs:
  # Renders a K8s job manifest for each matrix entry, submits it, then turns
  # the logs fetched from S3 into a sitrep and a badge that are uploaded as
  # artifacts.
  transformer-engine-test-eks:
    env:
      TE_EKS_FILES_PREFIX: .github/eks-workflow-files/transformer-engine
      # One name per matrix entry; used below as the submitted job name, the
      # S3 log prefix and the artifact suffix.
      RUN_NAME: ${{ inputs.JOB_NAME }}-${{ matrix.N_GPU }}-${{ matrix.TEST }}
    strategy:
      fail-fast: true
      matrix:
        # Explicit combinations: multigpu on 2, 4 and 8 GPUs; unittest on 8.
        include:
          - TEST: multigpu
            N_GPU: 2
          - TEST: multigpu
            N_GPU: 4
          - TEST: multigpu
            N_GPU: 8
          - TEST: unittest
            N_GPU: 8
    runs-on: [eks]  # H100
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: K8s GHCR store and delete token
        id: store-token
        uses: ./.github/actions/store-delete-k8s-ghcr

      # Builds <prefix>/<TEST>.yml from <prefix>/test.yml: every template line
      # containing SCRIPT is replaced by the contents of scripts/<TEST>.sh,
      # then the JOB_NAME / IMAGE_URI / SECRETS_NAME / N_GPU placeholders are
      # substituted. The manifest is written exactly once (no read-and-write
      # of the same file inside one pipeline) and echoed to the step log.
      # '|' is the sed delimiter so image URIs pinned by digest
      # ("...@sha256:...") do not terminate the expression.
      - name: Configure job manifest
        run: |
          K8S_JOB_TEMPLATE="${{ env.TE_EKS_FILES_PREFIX }}/test.yml"
          K8S_JOB_MANIFEST="${{ env.TE_EKS_FILES_PREFIX }}/${{ matrix.TEST }}.yml"
          SCRIPT="${{ env.TE_EKS_FILES_PREFIX }}/scripts/${{ matrix.TEST }}.sh"

          sed '/SCRIPT/ {
          r '"$SCRIPT"'
          d
          }' "$K8S_JOB_TEMPLATE" | \
            sed \
              -e "s|JOB_NAME|${{ inputs.JOB_NAME }}|g" \
              -e "s|IMAGE_URI|${{ inputs.JAX_DOCKER_IMAGE }}|g" \
              -e "s|SECRETS_NAME|${{ steps.store-token.outputs.token-name }}|g" \
              -e "s|N_GPU|${{ matrix.N_GPU }}|g" | \
            tee "$K8S_JOB_MANIFEST"

      # NOTE(review): the manifest's JOB_NAME placeholder is filled with
      # inputs.JOB_NAME above, while job-name here is env.RUN_NAME - confirm
      # the two are meant to differ (all matrix entries share inputs.JOB_NAME).
      - name: Submit & delete transformer engine unit test job
        uses: ./.github/actions/submit-delete-k8s-job
        with:
          job-config-file: ${{ env.TE_EKS_FILES_PREFIX }}/${{ matrix.TEST }}.yml
          job-name: ${{ env.RUN_NAME }}

      # Copies the *.log files for this run into <CI_NAME>-output/ and counts
      # PASSED / FAILED lines in tests.log; the counts become step outputs.
      # RUN_NAME is a job-level env value, not a workflow input, so it is read
      # from the env context.
      # NOTE(review): assumes the K8s job uploads its logs under
      # <CI_NAME>/<RUN_NAME>/ - confirm against the job template.
      - name: Download and process logs from S3
        id: s3-logs-process
        run: |
          LOCAL_DIR=${{ inputs.CI_NAME }}-output
          mkdir -p $LOCAL_DIR
          # aws s3 cp s3://${{ inputs.S3_BUCKET }}/${{ inputs.CI_NAME }}/${{ env.RUN_NAME }}/summary.txt $LOCAL_DIR/
          aws s3 cp s3://${{ inputs.S3_BUCKET }}/${{ inputs.CI_NAME }}/${{ env.RUN_NAME }}/ $LOCAL_DIR/ --recursive --exclude "*" --include "*.log"

          passed=$(cat $LOCAL_DIR/tests.log | grep 'PASSED opt/transformer' | wc -l || true)
          failed=$(cat $LOCAL_DIR/tests.log | grep 'FAILED opt/transformer' | wc -l || true)
          total=$((failed + passed))

          echo "Passed tests: $passed"
          echo "Failed tests: $failed"
          echo "Total tests: $total"
          echo "PASSED_TESTS=$passed" >> $GITHUB_OUTPUT
          echo "FAILED_TESTS=$failed" >> $GITHUB_OUTPUT
          echo "TOTAL_TESTS=$total" >> $GITHUB_OUTPUT

      # Writes sitrep.json and badge-transformer-engine-test.json from the
      # counts produced by the s3-logs-process step.
      - name: Generate sitrep
        id: sitrep
        if: ${{ !cancelled() }}
        shell: bash -x -e {0}
        run: |
          # bring in utility functions
          source .github/workflows/scripts/to_json.sh

          badge_label='TransformerEngine EKS Unit'

          total_tests=${{ steps.s3-logs-process.outputs.TOTAL_TESTS }}
          failed_tests=${{ steps.s3-logs-process.outputs.FAILED_TESTS }}
          passed_tests=${{ steps.s3-logs-process.outputs.PASSED_TESTS }}
          errors="0"
          summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
          badge_message="Passed $passed_tests out of $total_tests."
          badge_color="brightgreen"

          # The badge turns red as soon as one test failed. No line
          # continuation after `fi`: joining it with the next command is a
          # bash syntax error.
          if [ "$failed_tests" -gt 0 ]; then
            badge_color="red"
          fi

          to_json \
            summary \
            errors total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json

          schemaVersion=1 \
          label="${badge_label}" \
          message="Passed $passed_tests out of $total_tests." \
          color=$badge_color \
          to_json schemaVersion label message color \
          > badge-transformer-engine-test.json

      # The log directory path mirrors LOCAL_DIR from the s3-logs-process
      # step (<CI_NAME>-output).
      - name: Upload artifacts
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: "artifact-multigpu-test-${{ env.RUN_NAME }}"
          path: |
            sitrep.json
            badge-transformer-engine-test.json
            ${{ inputs.CI_NAME }}-output/*