Skip to content

Commit 00ff715

Browse files
committed
Merge main
1 parent 85e00ce commit 00ff715

File tree

1 file changed

+197
-87
lines changed

1 file changed

+197
-87
lines changed

.github/workflows/_ci.yaml

+197-87
Original file line numberDiff line numberDiff line change
@@ -66,23 +66,6 @@ jobs:
6666
URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
6767
secrets: inherit
6868

69-
build-triton:
70-
needs: build-jax
71-
if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
72-
uses: ./.github/workflows/_build.yaml
73-
with:
74-
ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
75-
ARTIFACT_NAME: artifact-triton-build
76-
BADGE_FILENAME: badge-triton-build
77-
BUILD_DATE: ${{ inputs.BUILD_DATE }}
78-
BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
79-
CONTAINER_NAME: triton
80-
DOCKERFILE: .github/container/Dockerfile.triton
81-
RUNNER_SIZE: large
82-
EXTRA_BUILD_ARGS: |
83-
URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
84-
secrets: inherit
85-
8669
build-equinox:
8770
needs: build-jax
8871
uses: ./.github/workflows/_build.yaml
@@ -176,9 +159,23 @@ jobs:
176159
URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
177160
secrets: inherit
178161

162+
build-axlearn:
163+
needs: build-jax
164+
uses: ./.github/workflows/_build.yaml
165+
with:
166+
ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
167+
ARTIFACT_NAME: artifact-axlearn-build
168+
BADGE_FILENAME: badge-axlearn-build
169+
BUILD_DATE: ${{ inputs.BUILD_DATE }}
170+
BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
171+
CONTAINER_NAME: axlearn
172+
DOCKERFILE: .github/container/Dockerfile.axlearn
173+
RUNNER_SIZE: large
174+
secrets: inherit
175+
179176
collect-docker-tags:
180177
runs-on: ubuntu-22.04
181-
if: "!cancelled()"
178+
if: ${{ !cancelled() }}
182179
needs:
183180
- build-base
184181
- build-jax
@@ -198,10 +195,23 @@ jobs:
198195
run: |
199196
TAGS=$(cat <<EOF | jq -c
200197
[\
201-
{"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
202-
{"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
203-
{"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
204-
{}\
198+
{"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
199+
{"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
200+
{"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
201+
{"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
202+
{"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
203+
{"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
204+
{"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
205+
{"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
206+
{"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
207+
{"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
208+
{"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
209+
{"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
210+
{"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
211+
{"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
212+
{"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
213+
{"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
214+
{"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
205215
]
206216
EOF
207217
)
@@ -396,74 +406,51 @@ jobs:
396406
test-nsys-jax-eks:
397407
needs: build-jax
398408
if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
399-
runs-on: eks
400-
env:
401-
JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
402-
JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax
403-
POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
404-
TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token
405-
steps:
406-
- name: Check out the repository
407-
uses: actions/checkout@v4
409+
runs-on: eks
410+
env:
411+
JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
412+
JOB_NAME: ${{ github.run_id }}-nsys-jax
413+
POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
414+
steps:
415+
- name: Check out the repository
416+
uses: actions/checkout@v4
408417
- name: Login to GitHub Container Registry
409418
uses: docker/login-action@v3
410419
with:
411-
registry: ghcr.io
412-
username: ${{ github.repository_owner }}
413-
password: ${{ secrets.GITHUB_TOKEN }}
414-
- name: Store GitHub Container Registry token as Kubernetes secret
415-
run: |
416-
kubectl create secret generic \
417-
${{ github.run_id }}-${{ github.run_attempt }}-token \
418-
--from-file=.dockerconfigjson=$HOME/.docker/config.json \
419-
--type=kubernetes.io/dockerconfigjson
420-
- name: Configure Kubernetes job
421-
run: |
422-
yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
423-
| select(di == 1).metadata.name = strenv(JOB_NAME)
424-
| select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
425-
| select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
426-
| select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
427-
.github/eks-workflow-files/job.yml
428-
git diff .github/eks-workflow-files/job.yml
429-
- name: Submit Kubernetes job
430-
run: kubectl apply -f .github/eks-workflow-files/job.yml
431-
- name: Wait for Kubernetes job to start
432-
run: |
433-
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
434-
sleep 2
435-
done
436-
- name: Stream Kubernetes job output
437-
run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
438-
# Clean up in case of errors as well as success
439-
- name: Delete Kubernetes job
440-
if: always()
441-
run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
442-
- name: Configure post-processing job
443-
run: |
444-
export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
445-
yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
446-
| .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
447-
| .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
448-
| .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
449-
.github/eks-workflow-files/post-process-job.yml
450-
git diff .github/eks-workflow-files/post-process-job.yml
451-
- name: Submit post-processing Kubernetes job
452-
run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml
453-
- name: Wait for post-processing Kubernetes job to start
454-
run: |
455-
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
456-
sleep 2
457-
done
458-
- name: Stream post-processing Kubernetes job output
459-
run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
460-
# Clean up in case of errors as well as success
461-
- name: Delete post-processing Kubernetes job
462-
if: always()
463-
run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
464-
- name: Delete GitHub Container Registry token
465-
if: always()
466-
run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
420+
registry: ghcr.io
421+
username: ${{ github.repository_owner }}
422+
password: ${{ secrets.GITHUB_TOKEN }}
423+
- name: K8s GHCR store and delete token
424+
id: store-token
425+
uses: ./.github/actions/store-delete-k8s-ghcr
426+
- name: Configure Kubernetes job
427+
run: |
428+
yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
429+
| select(di == 1).metadata.name = strenv(JOB_NAME)
430+
| select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
431+
| select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
432+
| select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
433+
.github/eks-workflow-files/job.yml
434+
git diff .github/eks-workflow-files/job.yml
435+
- name: Submit Kubernetes job
436+
uses: ./.github/actions/submit-delete-k8s-job
437+
with:
438+
job-config-file: .github/eks-workflow-files/job.yml
439+
job-name: ${{ env.JOB_NAME }}
440+
- name: Configure post-processing job
441+
run: |
442+
export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
443+
yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
444+
| .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
445+
| .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
446+
| .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
447+
.github/eks-workflow-files/post-process-job.yml
448+
git diff .github/eks-workflow-files/post-process-job.yml
449+
- name: Submit post process Kubernetes job
450+
uses: ./.github/actions/submit-delete-k8s-job
451+
with:
452+
job-config-file: .github/eks-workflow-files/post-process-job.yml
453+
job-name: ${{ env.POSTPROCESS_JOB_NAME }}
467454

468455
# test-equinox:
469456
# needs: build-equinox
@@ -663,3 +650,126 @@ jobs:
663650
with:
664651
MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
665652
secrets: inherit
653+
654+
test-axlearn-eks:
655+
needs: build-axlearn
656+
if: inputs.ARCHITECTURE == 'amd64'
657+
runs-on: eks
658+
env:
659+
AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
660+
JOB_NAME: axlearn-${{ github.run_id }}
661+
steps:
662+
- name: Check out the repository
663+
uses: actions/checkout@v4
664+
- name: Login to GitHub Container Registry
665+
uses: docker/login-action@v3
666+
with:
667+
registry: ghcr.io
668+
username: ${{ github.repository_owner }}
669+
password: ${{ secrets.GITHUB_TOKEN }}
670+
- name: K8s GHCR store and delete token
671+
id: store-token
672+
uses: ./.github/actions/store-delete-k8s-ghcr
673+
- name: Configure axlearn test job
674+
run: |
675+
# Replace placeholders in axlearn-job.yml with environment variables
676+
yq -i ea '
677+
select(di == 0).metadata.name = strenv(JOB_NAME)
678+
| select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
679+
| select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
680+
| select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
681+
.github/eks-workflow-files/axlearn/axlearn-job.yml
682+
git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
683+
- name: Submit & delete axlearn test
684+
uses: ./.github/actions/submit-delete-k8s-job
685+
with:
686+
job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
687+
job-name: ${{ env.JOB_NAME }}
688+
- name: Download logs from S3
689+
id: log-s3
690+
run: |
691+
mkdir -p axlearn-output
692+
aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
693+
aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"
694+
passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
695+
failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
696+
total_tests=$((failed_tests + passed_tests))
697+
echo "Passed tests: $passed_tests"
698+
echo "Failed tests: $failed_tests"
699+
echo "Total tests: $total_tests"
700+
echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
701+
echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
702+
echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
703+
- name: Generate sitrep
704+
id: sitrep
705+
if: ${{ !cancelled() }}
706+
shell: bash -x -e {0}
707+
run: |
708+
# bring in utility functions
709+
source .github/workflows/scripts/to_json.sh
710+
badge_label='Axlearn EKS Unit'
711+
total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
712+
failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
713+
passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
714+
errors="0" \
715+
summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
716+
badge_message="Passed $passed_tests out of $total_tests." \
717+
badge_color="brightgreen"
718+
if [ "$failed_tests" -gt 0 ]; then
719+
badge_color="red"
720+
fi \
721+
to_json \
722+
summary \
723+
errors total_tests passed_tests failed_tests \
724+
badge_label badge_color badge_message \
725+
> sitrep.json
726+
schemaVersion=1 \
727+
label="${badge_label}" \
728+
message="Passed $passed_tests out of $total_tests." \
729+
color=$badge_color \
730+
to_json schemaVersion label message color \
731+
> badge-axlearn-test.json
732+
- name: Upload artifacts
733+
if: ${{ !cancelled() }}
734+
uses: actions/upload-artifact@v4
735+
with:
736+
name: "artifact-axlearn-test"
737+
path: |
738+
sitrep.json
739+
badge-axlearn-test.json
740+
axlearn-output/*
741+
# The fuji test runs for only 20 minutes: as of 2025-02-24
742+
# it is not possible to set the `max_steps` value;
743+
# this will be done with custom Python code.
744+
test-axlearn-fuji-models-eks:
745+
needs: build-axlearn
746+
if: inputs.ARCHITECTURE == 'amd64'
747+
runs-on: eks
748+
env:
749+
AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
750+
JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
751+
steps:
752+
- name: Check out the repository
753+
uses: actions/checkout@v4
754+
- name: Login to GitHub Container Registry
755+
uses: docker/login-action@v3
756+
with:
757+
registry: ghcr.io
758+
username: ${{ github.repository_owner }}
759+
password: ${{ secrets.GITHUB_TOKEN }}
760+
- name: K8s GHCR store and delete token
761+
id: store-token
762+
uses: ./.github/actions/store-delete-k8s-ghcr
763+
- name: Configure axlearn test job
764+
run: |
765+
yq -i ea '
766+
select(di == 0).metadata.name = strenv(JOB_NAME)
767+
| select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
768+
| select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
769+
.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
770+
git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
771+
- name: Submit & delete axlearn test
772+
uses: ./.github/actions/submit-delete-k8s-job
773+
with:
774+
job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
775+
job-name: ${{ env.JOB_NAME }}

0 commit comments

Comments
 (0)