Skip to content

Commit a93b216

Browse files
committed
Merge main
1 parent 85e00ce commit a93b216

File tree

1 file changed

+173
-61
lines changed

1 file changed

+173
-61
lines changed

.github/workflows/_ci.yaml

+173-61
Original file line numberDiff line numberDiff line change
@@ -66,23 +66,6 @@ jobs:
6666
URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
6767
secrets: inherit
6868

69-
build-triton:
70-
needs: build-jax
71-
if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
72-
uses: ./.github/workflows/_build.yaml
73-
with:
74-
ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
75-
ARTIFACT_NAME: artifact-triton-build
76-
BADGE_FILENAME: badge-triton-build
77-
BUILD_DATE: ${{ inputs.BUILD_DATE }}
78-
BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
79-
CONTAINER_NAME: triton
80-
DOCKERFILE: .github/container/Dockerfile.triton
81-
RUNNER_SIZE: large
82-
EXTRA_BUILD_ARGS: |
83-
URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
84-
secrets: inherit
85-
8669
build-equinox:
8770
needs: build-jax
8871
uses: ./.github/workflows/_build.yaml
@@ -176,9 +159,23 @@ jobs:
176159
URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
177160
secrets: inherit
178161

162+
build-axlearn:
  # Builds the axlearn container through the shared _build.yaml workflow,
  # using the build-jax job's DOCKER_TAG_MEALKIT output as the base image.
  needs: build-jax
  uses: ./.github/workflows/_build.yaml
  secrets: inherit
  with:
    # What to build and from which base.
    CONTAINER_NAME: axlearn
    DOCKERFILE: .github/container/Dockerfile.axlearn
    BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
    # Values forwarded from this workflow's inputs.
    ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
    BUILD_DATE: ${{ inputs.BUILD_DATE }}
    RUNNER_SIZE: large
    # Names passed for the build artifact and badge.
    ARTIFACT_NAME: artifact-axlearn-build
    BADGE_FILENAME: badge-axlearn-build
175+
179176
collect-docker-tags:
180177
runs-on: ubuntu-22.04
181-
if: "!cancelled()"
178+
if: ${{ !cancelled() }}
182179
needs:
183180
- build-base
184181
- build-jax
@@ -189,6 +186,7 @@ jobs:
189186
- build-upstream-t5x
190187
- build-rosetta-t5x
191188
- build-gemma
189+
- build-axlearn
192190
outputs:
193191
TAGS: ${{ steps.collect-tags.outputs.TAGS }}
194192

@@ -198,10 +196,24 @@ jobs:
198196
run: |
199197
TAGS=$(cat <<EOF | jq -c
200198
[\
201-
{"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
202-
{"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
203-
{"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
204-
{}\
199+
{"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
200+
{"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
201+
{"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
202+
{"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
203+
{"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
204+
{"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
205+
{"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
206+
{"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
207+
{"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
208+
{"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
209+
{"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
210+
{"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
211+
{"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
212+
{"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
213+
{"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
214+
{"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
215+
{"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
216+
{}\
205217
]
206218
EOF
207219
)
@@ -399,9 +411,8 @@ jobs:
399411
runs-on: eks
400412
env:
401413
JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
402-
JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax
403-
POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
404-
TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token
414+
JOB_NAME: ${{ github.run_id }}-nsys-jax
415+
POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
405416
steps:
406417
- name: Check out the repository
407418
uses: actions/checkout@v4
@@ -411,59 +422,37 @@ jobs:
411422
registry: ghcr.io
412423
username: ${{ github.repository_owner }}
413424
password: ${{ secrets.GITHUB_TOKEN }}
414-
- name: Store GitHub Container Registry token as Kubernetes secret
415-
run: |
416-
kubectl create secret generic \
417-
${{ github.run_id }}-${{ github.run_attempt }}-token \
418-
--from-file=.dockerconfigjson=$HOME/.docker/config.json \
419-
--type=kubernetes.io/dockerconfigjson
425+
- name: K8s GHCR store and delete token
426+
id: store-token
427+
uses: ./.github/actions/store-delete-k8s-ghcr
420428
- name: Configure Kubernetes job
421429
run: |
422430
yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
423431
| select(di == 1).metadata.name = strenv(JOB_NAME)
424-
| select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
432+
| select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
425433
| select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
426434
| select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
427435
.github/eks-workflow-files/job.yml
428436
git diff .github/eks-workflow-files/job.yml
429437
- name: Submit Kubernetes job
430-
run: kubectl apply -f .github/eks-workflow-files/job.yml
431-
- name: Wait for Kubernetes job to start
432-
run: |
433-
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
434-
sleep 2
435-
done
436-
- name: Stream Kubernetes job output
437-
run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
438-
# Clean up in case of errors as well as success
439-
- name: Delete Kubernetes job
440-
if: always()
441-
run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
438+
uses: ./.github/actions/submit-delete-k8s-job
439+
with:
440+
job-config-file: .github/eks-workflow-files/job.yml
441+
job-name: ${{ env.JOB_NAME }}
442442
- name: Configure post-processing job
443443
run: |
444444
export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
445445
yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
446446
| .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
447-
| .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
447+
| .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
448448
| .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
449449
.github/eks-workflow-files/post-process-job.yml
450450
git diff .github/eks-workflow-files/post-process-job.yml
451-
- name: Submit post-processing Kubernetes job
452-
run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml
453-
- name: Wait for post-processing Kubernetes job to start
454-
run: |
455-
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
456-
sleep 2
457-
done
458-
- name: Stream post-processing Kubernetes job output
459-
run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
460-
# Clean up in case of errors as well as success
461-
- name: Delete post-processing Kubernetes job
462-
if: always()
463-
run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
464-
- name: Delete GitHub Container Registry token
465-
if: always()
466-
run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
451+
- name: Submit post process Kubernetes job
452+
uses: ./.github/actions/submit-delete-k8s-job
453+
with:
454+
job-config-file: .github/eks-workflow-files/post-process-job.yml
455+
job-name: ${{ env.POSTPROCESS_JOB_NAME }}
467456

468457
# test-equinox:
469458
# needs: build-equinox
@@ -663,3 +652,126 @@ jobs:
663652
with:
664653
MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
665654
secrets: inherit
655+
656+
test-axlearn-eks:
  # Submits the axlearn test job on the `eks` runner, downloads its summary
  # and logs from S3, then publishes a sitrep and a badge as an artifact.
  needs: build-axlearn
  if: inputs.ARCHITECTURE == 'amd64'
  runs-on: eks
  env:
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
    JOB_NAME: axlearn-${{ github.run_id }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    # The `token-name` output of this step is used below as the
    # imagePullSecrets name of the Kubernetes job.
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Replace placeholders in axlearn-job.yml with environment variables
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
        .github/eks-workflow-files/axlearn/axlearn-job.yml
        # Show the resulting changes in the step log.
        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
    - name: Submit & delete axlearn test
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
        job-name: ${{ env.JOB_NAME }}
    - name: Download logs from S3
      id: log-s3
      run: |
        mkdir -p axlearn-output
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"
        # `grep -c` exits non-zero when it counts 0 matches; `|| true` keeps
        # the step from failing in that case.
        passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
        failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
        total_tests=$((failed_tests + passed_tests))
        echo "Passed tests: $passed_tests"
        echo "Failed tests: $failed_tests"
        echo "Total tests: $total_tests"
        # Expose the counts to later steps as step outputs.
        echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
        echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
        echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
    - name: Generate sitrep
      id: sitrep
      if: ${{ !cancelled() }}
      shell: bash -x -e {0}
      run: |
        # bring in utility functions
        source .github/workflows/scripts/to_json.sh
        badge_label='Axlearn EKS Unit'
        total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
        failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
        passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
        errors="0" \
        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
        badge_message="Passed $passed_tests out of $total_tests." \
        badge_color="brightgreen"
        if [ "$failed_tests" -gt 0 ]; then
          badge_color="red"
        fi
        # The `if` statement must end at `fi`: a line continuation after it
        # would join `to_json` onto `fi`, which bash rejects as a syntax error.
        to_json \
          summary \
          errors total_tests passed_tests failed_tests \
          badge_label badge_color badge_message \
        > sitrep.json
        schemaVersion=1 \
        label="${badge_label}" \
        message="Passed $passed_tests out of $total_tests." \
        color=$badge_color \
        to_json schemaVersion label message color \
        > badge-axlearn-test.json
    - name: Upload artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: "artifact-axlearn-test"
        path: |
          sitrep.json
          badge-axlearn-test.json
          axlearn-output/*
743+
# As of 2025-02-24 the fuji test runs for 20 minutes only:
# it is not possible to set the `max_steps` value;
# this will be done with custom Python code.
test-axlearn-fuji-models-eks:
  needs: build-axlearn
  runs-on: eks
  if: ${{ inputs.ARCHITECTURE == 'amd64' }}
  env:
    JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    # The `token-name` output of this step is used below as the
    # imagePullSecrets name of the Kubernetes job.
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Set the job name, container image and pull secret in the job file.
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
        .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
        # Show the resulting changes in the step log.
        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
    - name: Submit & delete axlearn test
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        job-name: ${{ env.JOB_NAME }}
        job-config-file: '.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml'

0 commit comments

Comments
 (0)