66
66
URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
67
67
secrets : inherit
68
68
69
- build-triton :
70
- needs : build-jax
71
- if : inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
72
- uses : ./.github/workflows/_build.yaml
73
- with :
74
- ARCHITECTURE : ${{ inputs.ARCHITECTURE }}
75
- ARTIFACT_NAME : artifact-triton-build
76
- BADGE_FILENAME : badge-triton-build
77
- BUILD_DATE : ${{ inputs.BUILD_DATE }}
78
- BASE_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
79
- CONTAINER_NAME : triton
80
- DOCKERFILE : .github/container/Dockerfile.triton
81
- RUNNER_SIZE : large
82
- EXTRA_BUILD_ARGS : |
83
- URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
84
- secrets : inherit
85
-
86
69
build-equinox :
87
70
needs : build-jax
88
71
uses : ./.github/workflows/_build.yaml
@@ -176,9 +159,23 @@ jobs:
176
159
URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
177
160
secrets : inherit
178
161
162
+ build-axlearn :
163
+ needs : build-jax
164
+ uses : ./.github/workflows/_build.yaml
165
+ with :
166
+ ARCHITECTURE : ${{ inputs.ARCHITECTURE }}
167
+ ARTIFACT_NAME : artifact-axlearn-build
168
+ BADGE_FILENAME : badge-axlearn-build
169
+ BUILD_DATE : ${{ inputs.BUILD_DATE }}
170
+ BASE_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
171
+ CONTAINER_NAME : axlearn
172
+ DOCKERFILE : .github/container/Dockerfile.axlearn
173
+ RUNNER_SIZE : large
174
+ secrets : inherit
175
+
179
176
collect-docker-tags :
180
177
runs-on : ubuntu-22.04
181
- if : " !cancelled()"
178
+ if : ${{ !cancelled() }}
182
179
needs :
183
180
- build-base
184
181
- build-jax
@@ -198,10 +195,23 @@ jobs:
198
195
run : |
199
196
TAGS=$(cat <<EOF | jq -c
200
197
[\
201
- {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
202
- {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
203
- {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
204
- {}\
198
+ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
199
+ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
200
+ {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
201
+ {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
202
+ {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
203
+ {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
204
+ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
205
+ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
206
+ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
207
+ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
208
+ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
209
+ {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
210
+ {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
211
+ {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
212
+ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
213
+ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
214
+ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
205
215
]
206
216
EOF
207
217
)
@@ -396,74 +406,51 @@ jobs:
396
406
test-nsys-jax-eks :
397
407
needs : build-jax
398
408
if : inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
399
- runs-on : eks
400
- env :
401
- JAX_DOCKER_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
402
- JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-jax
403
- POSTPROCESS_JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
404
- TOKEN_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-token
405
- steps :
406
- - name : Check out the repository
407
- uses : actions/checkout@v4
409
+ runs-on : eks
410
+ env :
411
+ JAX_DOCKER_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
412
+ JOB_NAME : ${{ github.run_id }}-nsys-jax
413
+ POSTPROCESS_JOB_NAME : ${{ github.run_id }}-nsys-jax-postprocess
414
+ steps :
415
+ - name : Check out the repository
416
+ uses : actions/checkout@v4
408
417
- name : Login to GitHub Container Registry
409
418
uses : docker/login-action@v3
410
419
with :
411
- registry : ghcr.io
412
- username : ${{ github.repository_owner }}
413
- password : ${{ secrets.GITHUB_TOKEN }}
414
- - name : Store GitHub Container Registry token as Kubernetes secret
415
- run : |
416
- kubectl create secret generic \
417
- ${{ github.run_id }}-${{ github.run_attempt }}-token \
418
- --from-file=.dockerconfigjson=$HOME/.docker/config.json \
419
- --type=kubernetes.io/dockerconfigjson
420
- - name : Configure Kubernetes job
421
- run : |
422
- yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
423
- | select(di == 1).metadata.name = strenv(JOB_NAME)
424
- | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
425
- | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
426
- | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
427
- .github/eks-workflow-files/job.yml
428
- git diff .github/eks-workflow-files/job.yml
429
- - name : Submit Kubernetes job
430
- run : kubectl apply -f .github/eks-workflow-files/job.yml
431
- - name : Wait for Kubernetes job to start
432
- run : |
433
- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
434
- sleep 2
435
- done
436
- - name : Stream Kubernetes job output
437
- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
438
- # Clean up in case of errors as well as success
439
- - name : Delete Kubernetes job
440
- if : always()
441
- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
442
- - name : Configure post-processing job
443
- run : |
444
- export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
445
- yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
446
- | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
447
- | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
448
- | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
449
- .github/eks-workflow-files/post-process-job.yml
450
- git diff .github/eks-workflow-files/post-process-job.yml
451
- - name : Submit post-processing Kubernetes job
452
- run : kubectl apply -f .github/eks-workflow-files/post-process-job.yml
453
- - name : Wait for post-processing Kubernetes job to start
454
- run : |
455
- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
456
- sleep 2
457
- done
458
- - name : Stream post-processing Kubernetes job output
459
- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
460
- # Clean up in case of errors as well as success
461
- - name : Delete post-processing Kubernetes job
462
- if : always()
463
- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
464
- - name : Delete GitHub Container Registry token
465
- if : always()
466
- run : kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
420
+ registry : ghcr.io
421
+ username : ${{ github.repository_owner }}
422
+ password : ${{ secrets.GITHUB_TOKEN }}
423
+ - name : K8s GHCR store and delete token
424
+ id : store-token
425
+ uses : ./.github/actions/store-delete-k8s-ghcr
426
+ - name : Configure Kubernetes job
427
+ run : |
428
+ yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
429
+ | select(di == 1).metadata.name = strenv(JOB_NAME)
430
+ | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
431
+ | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
432
+ | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
433
+ .github/eks-workflow-files/job.yml
434
+ git diff .github/eks-workflow-files/job.yml
435
+ - name : Submit Kubernetes job
436
+ uses : ./.github/actions/submit-delete-k8s-job
437
+ with :
438
+ job-config-file : .github/eks-workflow-files/job.yml
439
+ job-name : ${{ env.JOB_NAME }}
440
+ - name : Configure post-processing job
441
+ run : |
442
+ export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
443
+ yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
444
+ | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
445
+ | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
446
+ | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
447
+ .github/eks-workflow-files/post-process-job.yml
448
+ git diff .github/eks-workflow-files/post-process-job.yml
449
+ - name : Submit post process Kubernetes job
450
+ uses : ./.github/actions/submit-delete-k8s-job
451
+ with :
452
+ job-config-file : .github/eks-workflow-files/post-process-job.yml
453
+ job-name : ${{ env.POSTPROCESS_JOB_NAME }}
467
454
468
455
# test-equinox:
469
456
# needs: build-equinox
@@ -663,3 +650,126 @@ jobs:
663
650
with :
664
651
MAXTEXT_IMAGE : ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
665
652
secrets : inherit
653
+
654
+ test-axlearn-eks :
655
+ needs : build-axlearn
656
+ if : inputs.ARCHITECTURE == 'amd64'
657
+ runs-on : eks
658
+ env :
659
+ AXLEARN_DOCKER_IMAGE : ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
660
+ JOB_NAME : axlearn-${{ github.run_id }}
661
+ steps :
662
+ - name : Check out the repository
663
+ uses : actions/checkout@v4
664
+ - name : Login to GitHub Container Registry
665
+ uses : docker/login-action@v3
666
+ with :
667
+ registry : ghcr.io
668
+ username : ${{ github.repository_owner }}
669
+ password : ${{ secrets.GITHUB_TOKEN }}
670
+ - name : K8s GHCR store and delete token
671
+ id : store-token
672
+ uses : ./.github/actions/store-delete-k8s-ghcr
673
+ - name : Configure axlearn test job
674
+ run : |
675
+ # Replace placeholders in axlearn-job.yml with environment variables
676
+ yq -i ea '
677
+ select(di == 0).metadata.name = strenv(JOB_NAME)
678
+ | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
679
+ | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
680
+ | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
681
+ .github/eks-workflow-files/axlearn/axlearn-job.yml
682
+ git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
683
+ - name : Submit & delete axlearn test
684
+ uses : ./.github/actions/submit-delete-k8s-job
685
+ with :
686
+ job-config-file : " .github/eks-workflow-files/axlearn/axlearn-job.yml"
687
+ job-name : ${{ env.JOB_NAME }}
688
+ - name : Download logs from S3
689
+ id : log-s3
690
+ run : |
691
+ mkdir -p axlearn-output
692
+ aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
693
+ aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"
694
+ passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
695
+ failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
696
+ total_tests=$((failed_tests + passed_tests))
697
+ echo "Passed tests: $passed_tests"
698
+ echo "Failed tests: $failed_tests"
699
+ echo "Total tests: $total_tests"
700
+ echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
701
+ echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
702
+ echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
703
+ - name : Generate sitrep
704
+ id : sitrep
705
+ if : ${{ !cancelled() }}
706
+ shell : bash -x -e {0}
707
+ run : |
708
+ # bring in utility functions
709
+ source .github/workflows/scripts/to_json.sh
710
+ badge_label='Axlearn EKS Unit'
711
+ total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
712
+ failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
713
+ passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
714
+ errors="0" \
715
+ summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
716
+ badge_message="Passed $passed_tests out of $total_tests." \
717
+ badge_color="brightgreen"
718
+ if [ "$failed_tests" -gt 0 ]; then
719
+ badge_color="red"
720
+ fi \
721
+ to_json \
722
+ summary \
723
+ errors total_tests passed_tests failed_tests \
724
+ badge_label badge_color badge_message \
725
+ > sitrep.json
726
+ schemaVersion=1 \
727
+ label="${badge_label}" \
728
+ message="Passed $passed_tests out of $total_tests." \
729
+ color=$badge_color \
730
+ to_json schemaVersion label message color \
731
+ > badge-axlearn-test.json
732
+ - name : Upload artifacts
733
+ if : ${{ !cancelled() }}
734
+ uses : actions/upload-artifact@v4
735
+ with :
736
+ name : " artifact-axlearn-test"
737
+ path : |
738
+ sitrep.json
739
+ badge-axlearn-test.json
740
+ axlearn-output/*
741
+ # the fuji test will run for 20 minutes only, as per 2025-02-24
742
+ # is not possible to set the `max_steps` value
743
+ # this will be done with a customer python code
744
+ test-axlearn-fuji-models-eks :
745
+ needs : build-axlearn
746
+ if : inputs.ARCHITECTURE == 'amd64'
747
+ runs-on : eks
748
+ env :
749
+ AXLEARN_DOCKER_IMAGE : ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
750
+ JOB_NAME : axlearn-fuji-3b-${{ github.run_id }}
751
+ steps :
752
+ - name : Check out the repository
753
+ uses : actions/checkout@v4
754
+ - name : Login to GitHub Container Registry
755
+ uses : docker/login-action@v3
756
+ with :
757
+ registry : ghcr.io
758
+ username : ${{ github.repository_owner }}
759
+ password : ${{ secrets.GITHUB_TOKEN }}
760
+ - name : K8s GHCR store and delete token
761
+ id : store-token
762
+ uses : ./.github/actions/store-delete-k8s-ghcr
763
+ - name : Configure axlearn test job
764
+ run : |
765
+ yq -i ea '
766
+ select(di == 0).metadata.name = strenv(JOB_NAME)
767
+ | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
768
+ | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
769
+ .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
770
+ git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
771
+ - name : Submit & delete axlearn test
772
+ uses : ./.github/actions/submit-delete-k8s-job
773
+ with :
774
+ job-config-file : " .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
775
+ job-name : ${{ env.JOB_NAME }}
0 commit comments