Skip to content

Commit 41c9f94

Browse files
committed
Refactor transformer engine eks workflow into separate workflow
1 parent be24e69 commit 41c9f94

File tree

2 files changed

+121
-89
lines changed

2 files changed

+121
-89
lines changed

.github/workflows/_ci.yaml

+5-89
Original file line numberDiff line numberDiff line change
@@ -488,103 +488,19 @@ jobs:
488488
# ARTIFACTS: |
489489
# test-equinox.log
490490
# secrets: inherit
491-
# Runs the TransformerEngine unit tests on the EKS cluster by delegating to the
# reusable workflow below; the inline steps this job used to carry now live in
# that workflow.
#
# A job that calls a reusable workflow (`uses:` at job level) may not declare
# `runs-on`; the runner is selected by the called workflow's own jobs, so the
# former `runs-on: eks` key is intentionally absent here.
#
# NOTE(review): GitHub documents that workflow files must live directly in
# `.github/workflows` (subdirectories are not supported) -- confirm that the
# `transformer-engine/` path below actually resolves.
test-transformerengine-h100:
  # needs: build-jax
  if: inputs.ARCHITECTURE == 'amd64'
  uses: ./.github/workflows/transformer-engine/_unittests_eks.yaml
  with:
    # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
    # NOTE(review): the image is pinned to a fixed tag while `needs: build-jax`
    # is commented out -- presumably temporary; confirm before merging.
    JAX_DOCKER_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:13652377029-jax-amd64
    # Unique per workflow run; also used as the S3 key prefix for the logs.
    JOB_NAME: transformerengine-${{ github.run_id }}
    S3_BUCKET: jax-toolbox-eks-output
    CI_NAME: transformer-engine
  secrets: inherit
588504

589505
# te-unittests:
590506
# secrets: inherit
@@ -753,4 +669,4 @@ jobs:
753669
# uses: ./.github/workflows/_test_maxtext.yaml
754670
# with:
755671
# MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
756-
# secrets: inherit
672+
# secrets: inherit
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
name: ~test TransformerEngine unittests EKS

# Reusable workflow: submits the TransformerEngine unit-test job to the EKS
# cluster, pulls the resulting logs from S3, and publishes a sitrep, a badge
# and the raw logs as a build artifact.
on:
  workflow_call:
    inputs:
      JAX_DOCKER_IMAGE:
        type: string
        description: 'URI of image to run tests on'
        required: true
      JOB_NAME:
        type: string
        description: 'Job name identifying the unique GitHub Actions run'
        required: true
      S3_BUCKET:
        type: string
        description: 'AWS S3 bucket to which logs will be uploaded for processing in CI'
        required: true
      CI_NAME:
        type: string
        description: 'Name of the CI'
        required: true

permissions:
  contents: read # to fetch code
  actions: write # to cancel previous workflows
  packages: write # to upload container

jobs:
  transformer-engine-unittest-eks:
    runs-on: [eks] # H100
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: K8s GHCR store and delete token
        id: store-token
        uses: ./.github/actions/store-delete-k8s-ghcr

      - name: Configure job manifest
        run: |
          MANIFEST=.github/eks-workflow-files/transformer-engine/unit-tests.yml
          # Substitute the placeholders in place. The previous
          # `cat f | sed ... | tee f` pipeline read and truncated the same file
          # concurrently, which can leave an empty manifest.
          sed -i \
            -e "s@JOB_NAME@${{ inputs.JOB_NAME }}@g" \
            -e "s@IMAGE_URI@${{ inputs.JAX_DOCKER_IMAGE }}@g" \
            -e "s@SECRETS_NAME@${{ steps.store-token.outputs.token-name }}@g" \
            "$MANIFEST"
          # Echo the rendered manifest to the job log, as `tee` used to do.
          cat "$MANIFEST"

      - name: Submit & delete transformer engine unit test job
        uses: ./.github/actions/submit-delete-k8s-job
        with:
          job-config-file: .github/eks-workflow-files/transformer-engine/unit-tests.yml
          job-name: ${{ inputs.JOB_NAME }}

      - name: Download and process logs from S3
        id: s3-logs-process
        run: |
          LOCAL_DIR=${{ inputs.CI_NAME }}-output

          mkdir -p "$LOCAL_DIR"
          # aws s3 cp s3://${{ inputs.S3_BUCKET }}/${{ inputs.CI_NAME }}/${{ inputs.JOB_NAME }}/summary.txt $LOCAL_DIR/
          # Only the *.log files are fetched from the job's S3 prefix.
          aws s3 cp \
            "s3://${{ inputs.S3_BUCKET }}/${{ inputs.CI_NAME }}/${{ inputs.JOB_NAME }}/" \
            "$LOCAL_DIR/" --recursive --exclude "*" --include "*.log"

          # Count pytest result lines; `|| true` keeps the step alive when
          # nothing matches.
          passed=$(cat "$LOCAL_DIR/tests.log" | grep 'PASSED opt/transformer' | wc -l || true)
          failed=$(cat "$LOCAL_DIR/tests.log" | grep 'FAILED opt/transformer' | wc -l || true)

          total=$((failed + passed))
          echo "Passed tests: $passed"
          echo "Failed tests: $failed"
          echo "Total tests: $total"
          echo "PASSED_TESTS=$passed" >> "$GITHUB_OUTPUT"
          echo "FAILED_TESTS=$failed" >> "$GITHUB_OUTPUT"
          echo "TOTAL_TESTS=$total" >> "$GITHUB_OUTPUT"

      - name: Generate sitrep
        id: sitrep
        if: ${{ !cancelled() }}
        shell: bash -x -e {0}
        run: |
          # bring in utility functions
          source .github/workflows/scripts/to_json.sh

          badge_label='TransformerEngine EKS Unit'

          # This step also runs when an earlier step failed, in which case the
          # outputs below are empty; fall back to 0 so the comparisons and the
          # generated JSON stay well-formed.
          total_tests=${{ steps.s3-logs-process.outputs.TOTAL_TESTS || 0 }} \
          failed_tests=${{ steps.s3-logs-process.outputs.FAILED_TESTS || 0 }} \
          passed_tests=${{ steps.s3-logs-process.outputs.PASSED_TESTS || 0 }} \
          errors="0" \
          summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
          badge_message="Passed $passed_tests out of $total_tests." \
          badge_color="brightgreen"
          # Zero collected tests means the logs were never produced or fetched,
          # so do not report that as a green run.
          if [ "$failed_tests" -gt 0 ] || [ "$total_tests" -eq 0 ]; then
            badge_color="red"
          fi

          to_json \
            summary \
            errors total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json

          schemaVersion=1 \
          label="${badge_label}" \
          message="Passed $passed_tests out of $total_tests." \
          color=$badge_color \
          to_json schemaVersion label message color \
          > badge-transformer-engine-test.json

      - name: Upload artifacts
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: "artifact-transformer-engine-test"
          # The log directory must match LOCAL_DIR from the S3 download step;
          # the former hard-coded `trasformer-engine-output/*` never matched.
          path: |
            sitrep.json
            badge-transformer-engine-test.json
            ${{ inputs.CI_NAME }}-output/*

0 commit comments

Comments
 (0)