66
66
URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
67
67
secrets : inherit
68
68
69
- build-triton :
70
- needs : build-jax
71
- if : inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
72
- uses : ./.github/workflows/_build.yaml
73
- with :
74
- ARCHITECTURE : ${{ inputs.ARCHITECTURE }}
75
- ARTIFACT_NAME : artifact-triton-build
76
- BADGE_FILENAME : badge-triton-build
77
- BUILD_DATE : ${{ inputs.BUILD_DATE }}
78
- BASE_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
79
- CONTAINER_NAME : triton
80
- DOCKERFILE : .github/container/Dockerfile.triton
81
- RUNNER_SIZE : large
82
- EXTRA_BUILD_ARGS : |
83
- URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
84
- secrets : inherit
85
-
86
69
build-equinox :
87
70
needs : build-jax
88
71
uses : ./.github/workflows/_build.yaml
@@ -176,9 +159,23 @@ jobs:
176
159
URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
177
160
secrets : inherit
178
161
162
# Build the AXLearn container through the shared reusable build workflow,
# using the JAX mealkit image produced by `build-jax` as the base image.
build-axlearn:
  needs: build-jax
  uses: ./.github/workflows/_build.yaml
  secrets: inherit
  with:
    # What to build and what to build it from
    CONTAINER_NAME: axlearn
    DOCKERFILE: .github/container/Dockerfile.axlearn
    BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
    # Build parameters forwarded from this workflow's inputs
    ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
    BUILD_DATE: ${{ inputs.BUILD_DATE }}
    RUNNER_SIZE: large
    # Names of the artifacts published by the build
    ARTIFACT_NAME: artifact-axlearn-build
    BADGE_FILENAME: badge-axlearn-build
175
+
179
176
collect-docker-tags :
180
177
runs-on : ubuntu-22.04
181
- if : " !cancelled()"
178
+ if : ${{ !cancelled() }}
182
179
needs :
183
180
- build-base
184
181
- build-jax
@@ -189,6 +186,7 @@ jobs:
189
186
- build-upstream-t5x
190
187
- build-rosetta-t5x
191
188
- build-gemma
189
+ - build-axlearn
192
190
outputs :
193
191
TAGS : ${{ steps.collect-tags.outputs.TAGS }}
194
192
@@ -198,10 +196,24 @@ jobs:
198
196
run : |
199
197
TAGS=$(cat <<EOF | jq -c
200
198
[\
201
- {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
202
- {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
203
- {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
204
- {}\
199
+ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
200
+ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
201
+ {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
202
+ {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
203
+ {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
204
+ {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
205
+ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
206
+ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
207
+ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
208
+ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
209
+ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
210
+ {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
211
+ {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
212
+ {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
213
+ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
214
+ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
215
+ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
216
+ {}\
205
217
]
206
218
EOF
207
219
)
@@ -399,9 +411,8 @@ jobs:
399
411
runs-on : eks
400
412
env :
401
413
JAX_DOCKER_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
402
- JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-jax
403
- POSTPROCESS_JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
404
- TOKEN_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-token
414
+ JOB_NAME : ${{ github.run_id }}-nsys-jax
415
+ POSTPROCESS_JOB_NAME : ${{ github.run_id }}-nsys-jax-postprocess
405
416
steps :
406
417
- name : Check out the repository
407
418
uses : actions/checkout@v4
@@ -411,59 +422,37 @@ jobs:
411
422
registry : ghcr.io
412
423
username : ${{ github.repository_owner }}
413
424
password : ${{ secrets.GITHUB_TOKEN }}
414
- - name : Store GitHub Container Registry token as Kubernetes secret
415
- run : |
416
- kubectl create secret generic \
417
- ${{ github.run_id }}-${{ github.run_attempt }}-token \
418
- --from-file=.dockerconfigjson=$HOME/.docker/config.json \
419
- --type=kubernetes.io/dockerconfigjson
425
+ - name : K8s GHCR store and delete token
426
+ id : store-token
427
+ uses : ./.github/actions/store-delete-k8s-ghcr
420
428
- name : Configure Kubernetes job
421
429
run : |
422
430
yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
423
431
| select(di == 1).metadata.name = strenv(JOB_NAME)
424
- | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
432
+ | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
425
433
| select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
426
434
| select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
427
435
.github/eks-workflow-files/job.yml
428
436
git diff .github/eks-workflow-files/job.yml
429
437
- name : Submit Kubernetes job
430
- run : kubectl apply -f .github/eks-workflow-files/job.yml
431
- - name : Wait for Kubernetes job to start
432
- run : |
433
- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
434
- sleep 2
435
- done
436
- - name : Stream Kubernetes job output
437
- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
438
- # Clean up in case of errors as well as success
439
- - name : Delete Kubernetes job
440
- if : always()
441
- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
438
+ uses : ./.github/actions/submit-delete-k8s-job
439
+ with :
440
+ job-config-file : .github/eks-workflow-files/job.yml
441
+ job-name : ${{ env.JOB_NAME }}
442
442
- name : Configure post-processing job
443
443
run : |
444
444
export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
445
445
yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
446
446
| .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
447
- | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
447
+ | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
448
448
| .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
449
449
.github/eks-workflow-files/post-process-job.yml
450
450
git diff .github/eks-workflow-files/post-process-job.yml
451
- - name : Submit post-processing Kubernetes job
452
- run : kubectl apply -f .github/eks-workflow-files/post-process-job.yml
453
- - name : Wait for post-processing Kubernetes job to start
454
- run : |
455
- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
456
- sleep 2
457
- done
458
- - name : Stream post-processing Kubernetes job output
459
- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
460
- # Clean up in case of errors as well as success
461
- - name : Delete post-processing Kubernetes job
462
- if : always()
463
- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
464
- - name : Delete GitHub Container Registry token
465
- if : always()
466
- run : kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
451
+ - name : Submit post process Kubernetes job
452
+ uses : ./.github/actions/submit-delete-k8s-job
453
+ with :
454
+ job-config-file : .github/eks-workflow-files/post-process-job.yml
455
+ job-name : ${{ env.POSTPROCESS_JOB_NAME }}
467
456
468
457
# test-equinox:
469
458
# needs: build-equinox
@@ -663,3 +652,126 @@ jobs:
663
652
with :
664
653
MAXTEXT_IMAGE : ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
665
654
secrets : inherit
655
+
656
# Run the AXLearn test job on the EKS runner (amd64 only), then download the
# results from S3 and publish a sitrep, a badge and the logs as an artifact.
test-axlearn-eks:
  needs: build-axlearn
  if: inputs.ARCHITECTURE == 'amd64'
  runs-on: eks
  env:
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
    JOB_NAME: axlearn-${{ github.run_id }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Replace placeholders in axlearn-job.yml with environment variables
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
          .github/eks-workflow-files/axlearn/axlearn-job.yml
        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
    - name: Submit & delete axlearn test
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        # No leading whitespace: the value is used as a file path
        job-config-file: .github/eks-workflow-files/axlearn/axlearn-job.yml
        job-name: ${{ env.JOB_NAME }}
    - name: Download logs from S3
      id: log-s3
      run: |
        mkdir -p axlearn-output
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"
        # `grep -c` exits non-zero when it counts 0 matches; `|| true` keeps the step alive
        passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
        failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
        total_tests=$((failed_tests + passed_tests))
        echo "Passed tests: $passed_tests"
        echo "Failed tests: $failed_tests"
        echo "Total tests: $total_tests"
        echo "PASSED_TESTS=$passed_tests" >> "$GITHUB_OUTPUT"
        echo "FAILED_TESTS=$failed_tests" >> "$GITHUB_OUTPUT"
        echo "TOTAL_TESTS=$total_tests" >> "$GITHUB_OUTPUT"
    - name: Generate sitrep
      id: sitrep
      if: ${{ !cancelled() }}
      shell: bash -x -e {0}
      run: |
        # bring in utility functions
        source .github/workflows/scripts/to_json.sh
        badge_label='Axlearn EKS Unit'
        # This step also runs when the download step failed, in which case its
        # outputs are empty; fall back to 0 so the arithmetic test stays valid.
        total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS || '0' }}
        failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS || '0' }}
        passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS || '0' }}
        errors="0"
        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
        badge_message="Passed $passed_tests out of $total_tests."
        badge_color="brightgreen"
        # Red when a test failed or when no result was collected at all
        if [ "$failed_tests" -gt 0 ] || [ "$total_tests" -eq 0 ]; then
          badge_color="red"
        fi
        to_json \
          summary \
          errors total_tests passed_tests failed_tests \
          badge_label badge_color badge_message \
          > sitrep.json
        # Prefix assignments: visible to this single to_json call only
        schemaVersion=1 \
        label="${badge_label}" \
        message="Passed $passed_tests out of $total_tests." \
        color=$badge_color \
        to_json schemaVersion label message color \
          > badge-axlearn-test.json
    - name: Upload artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: artifact-axlearn-test
        path: |
          sitrep.json
          badge-axlearn-test.json
          axlearn-output/*
743
# The fuji test runs for only 20 minutes: as of 2025-02-24 it is not possible
# to set the `max_steps` value. This will be done later with custom Python code.
test-axlearn-fuji-models-eks:
  needs: build-axlearn
  if: inputs.ARCHITECTURE == 'amd64'
  runs-on: eks
  env:
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
    JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Fill in the job name, container image and image pull secret
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
          .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
    - name: Submit & delete axlearn test
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        # No leading whitespace: the value is used as a file path
        job-config-file: .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
        job-name: ${{ env.JOB_NAME }}
0 commit comments