Add new versions of pretraining and fine-tuning #1009

Open: wants to merge 11 commits into main (showing changes from 8 commits)

4 changes: 2 additions & 2 deletions tutorials-and-examples/nvidia-bionemo/README.md
@@ -2,8 +2,8 @@

#### Pretraining ESM-2

- [Pretraining ESM-2 LLM on GKE using BioNeMo Framework 2.0](./esm2/README.md#pretraining)
- [Pretraining ESM-2 LLM on GKE using BioNeMo Framework 2.0](./pretraining/README.md)

#### Fine-tuning ESM-2

- [Fine-tuning ESM-2 LLM on GKE using BioNeMo Framework 2.0](./esm2/README.md#fine-tuning)
- [Fine-tuning ESM-2 LLM on GKE using BioNeMo Framework 2.0](./fine-tuning/README.md)
9 changes: 9 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/base/kustomization.yaml
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- storage/storage-class.yaml
- storage/pvcs.yaml
- monitoring/tensorboard-deployment.yaml
- monitoring/tensorboard-service.yaml
- monitoring/rbac.yaml
29 changes: 29 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/base/monitoring/rbac.yaml
@@ -0,0 +1,29 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: tensorboard-sa
namespace: bionemo-training
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: job-reader
namespace: bionemo-training
rules:
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: tensorboard-job-reader
namespace: bionemo-training
subjects:
- kind: ServiceAccount
name: tensorboard-sa
namespace: bionemo-training
roleRef:
kind: Role
name: job-reader
apiGroup: rbac.authorization.k8s.io
33 changes: 33 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/base/monitoring/tensorboard-deployment.yaml
@@ -0,0 +1,33 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: tensorboard
namespace: bionemo-training
spec:
replicas: 1
selector:
matchLabels:
app: tensorboard
template:
metadata:
labels:
app: tensorboard
spec:
containers:
- name: tensorboard
image: tensorflow/tensorflow:latest
command:
- tensorboard
args:
- --logdir=/workspace/bionemo2/results/lightning_logs
- --port=6006
volumeMounts:
- name: bionemo-storage
mountPath: /workspace/bionemo2/results
subPath: tensorboard-logs
readOnly: true
volumes:
- name: bionemo-storage
persistentVolumeClaim:
claimName: bionemo-filestore
serviceAccountName: tensorboard-sa
12 changes: 12 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/base/monitoring/tensorboard-service.yaml
@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: tensorboard-service
namespace: bionemo-training
spec:
selector:
app: tensorboard
ports:
- port: 6006
targetPort: 6006
type: ClusterIP
4 changes: 4 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/base/namespace.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: bionemo-training
5 changes: 5 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/base/storage/kustomization.yaml
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- storage-class.yaml
- pvcs.yaml
12 changes: 12 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/base/storage/pvcs.yaml
@@ -0,0 +1,12 @@
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: bionemo-filestore
namespace: bionemo-training
spec:
accessModes:
- ReadWriteMany
storageClassName: filestore-storage
resources:
requests:
storage: 1Ti
10 changes: 10 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/base/storage/storage-class.yaml
@@ -0,0 +1,10 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: filestore-storage
provisioner: filestore.csi.storage.gke.io
volumeBindingMode: Immediate
allowVolumeExpansion: true
parameters:
tier: BASIC_HDD
network: default
221 changes: 0 additions & 221 deletions tutorials-and-examples/nvidia-bionemo/esm2/README.md

This file was deleted.

23 changes: 0 additions & 23 deletions tutorials-and-examples/nvidia-bionemo/esm2/create-mount-fs.yaml

This file was deleted.

33 changes: 0 additions & 33 deletions tutorials-and-examples/nvidia-bionemo/esm2/esm2-finetuning.yaml

This file was deleted.

61 changes: 0 additions & 61 deletions tutorials-and-examples/nvidia-bionemo/esm2/esm2-pretraining.yaml

This file was deleted.

121 changes: 0 additions & 121 deletions tutorials-and-examples/nvidia-bionemo/esm2/finetuning.py

This file was deleted.

203 changes: 203 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/fine-tuning/README.md
@@ -0,0 +1,203 @@
# Fine-Tuning ESM2 LLM on GKE using BioNeMo Framework 2.0

This sample walks through setting up a Google Cloud GKE environment to fine-tune ESM2 (Evolutionary Scale Modeling) using the NVIDIA BioNeMo Framework 2.0.

## Table of Contents

- [Prerequisites](#prerequisites)
- [Setup](#setup)
- [Cleanup](#cleanup)

## Prerequisites

- **GCloud SDK:** Ensure you have the Google Cloud SDK installed and configured.
- **Project:** A Google Cloud project with billing enabled.
- **Permissions:** Sufficient permissions to create GKE clusters and other related resources.

**Note**: Google Cloud Shell is recommended for running this sample.

## Setup

1. Set Project:

```bash
gcloud config set project "your-project-id"
```

Replace "your-project-id" with your actual project ID.

2. Set Environment Variables:

```bash
export PROJECT_ID="your-project-id"
export PUBLIC_REPOSITORY=$PROJECT_ID
export REGION=us-central1
export ZONE=us-central1-b
export CLUSTER_NAME=bionemo-demo
export NODE_POOL_MACHINE_TYPE=a2-highgpu-2g
export CLUSTER_MACHINE_TYPE=e2-standard-4
export GPU_TYPE=nvidia-tesla-a100
export GPU_COUNT=2
```

Adjust the zone, machine type, accelerator type, count, and number of nodes as per your requirements. Refer to Google Cloud documentation for available options. Consider smaller machine types for development to manage costs.
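
Before committing to these values, it can help to check what is actually available in the chosen zone; a quick sketch (the filters are illustrative and can be adjusted):

```bash
# GPU types offered in the zone
gcloud compute accelerator-types list --filter="zone:${ZONE}"

# A2 machine types in the zone (a2-highgpu-* machines bundle A100 GPUs)
gcloud compute machine-types list --filter="zone:${ZONE} AND name~^a2"
```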

3. Enable the Filestore API and create a GKE Cluster

```bash
gcloud services enable file.googleapis.com --project ${PROJECT_ID}
```

```bash
gcloud container clusters create ${CLUSTER_NAME} \
--project=${PROJECT_ID} \
--location=${ZONE} \
--addons=GcpFilestoreCsiDriver \
--machine-type=${CLUSTER_MACHINE_TYPE} \
--num-nodes=1 \
--workload-pool=${PROJECT_ID}.svc.id.goog
```

4. Create GPU Node Pool:

```bash
gcloud container node-pools create gpupool \
--project=${PROJECT_ID} \
--location=${ZONE} \
--cluster=${CLUSTER_NAME} \
--machine-type=${NODE_POOL_MACHINE_TYPE} \
--num-nodes=1 \
--accelerator type=${GPU_TYPE},count=${GPU_COUNT},gpu-driver-version=latest
```

This creates a node pool specifically for GPU workloads.

5. Get Cluster Credentials:

```bash
gcloud container clusters get-credentials "${CLUSTER_NAME}" \
--location="${ZONE}"
```
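
Optionally confirm that the GPU nodes registered and carry the GKE GPU labels that the manifests in this tutorial select on (`cloud.google.com/gke-gpu`); a quick sanity check:

```bash
# GPU nodes should appear with the accelerator label populated
kubectl get nodes -l cloud.google.com/gke-gpu=true -L cloud.google.com/gke-accelerator
```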

6. Create an Artifact Registry to store container images

```bash
gcloud artifacts repositories create ${PUBLIC_REPOSITORY} --repository-format=docker --location=${REGION}
```

7. Create a Google service account for the inference workload (it is bound to the Kubernetes service account via Workload Identity below)

```bash
gcloud iam service-accounts create esm2-inference-gsa \
--project=$PROJECT_ID
```

8. Create the namespace and Kubernetes service account

```bash
kubectl create namespace bionemo-training

kubectl create serviceaccount esm2-inference-sa -n bionemo-training
```

9. Create the Workload Identity binding

```bash
gcloud iam service-accounts add-iam-policy-binding esm2-inference-gsa@${PROJECT_ID}.iam.gserviceaccount.com \
    --role="roles/iam.workloadIdentityUser" \
    --member="serviceAccount:${PROJECT_ID}.svc.id.goog[bionemo-training/esm2-inference-sa]"
```

```bash
kubectl annotate serviceaccount esm2-inference-sa -n bionemo-training \
    iam.gke.io/gcp-service-account=esm2-inference-gsa@${PROJECT_ID}.iam.gserviceaccount.com
```

Note: this requires Workload Identity to be configured at the cluster level (enabled above via `--workload-pool` when creating the cluster).
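
If the inference workload later has trouble using the Google service account, it can help to confirm that the binding and annotation took effect; a hedged check using the names from the steps above:

```bash
# The Kubernetes service account should carry the GCP service-account annotation
kubectl get serviceaccount esm2-inference-sa -n bionemo-training -o yaml | grep iam.gke.io

# The Google service account policy should list the workloadIdentityUser binding
gcloud iam service-accounts get-iam-policy \
    esm2-inference-gsa@${PROJECT_ID}.iam.gserviceaccount.com
```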

10. Launch the fine-tuning job

Make sure you are in this directory:

```bash
cd tutorials-and-examples/nvidia-bionemo/
```

Then apply the kustomization:

```bash
kubectl apply -k fine-tuning/job
```
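
The fine-tuning job can take a while to pull the BioNeMo image and run; a sketch for watching its progress (the `job-name` label is added to the pod automatically by Kubernetes):

```bash
# Watch the job's pod start up
kubectl get pods -n bionemo-training -l job-name=esm2-finetuning -w

# Stream the training logs
kubectl logs -f job/esm2-finetuning -n bionemo-training
```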

11. Build and push the inference server Docker image

```bash
docker build -t ${REGION}-docker.pkg.dev/${PROJECT_ID}/${PUBLIC_REPOSITORY}/esm2-inference:latest fine-tuning/inference/.
```

```bash
docker push ${REGION}-docker.pkg.dev/${PROJECT_ID}/${PUBLIC_REPOSITORY}/esm2-inference:latest
```
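
If the push is rejected with an authentication error, Docker most likely needs credentials for Artifact Registry; one way to set this up is the gcloud credential helper:

```bash
gcloud auth configure-docker ${REGION}-docker.pkg.dev
```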

12. Launch the inference deployment

Ensure the fine-tuning job status is `Complete` by running:

```bash
kubectl get job esm2-finetuning -n bionemo-training
```

Ensure the environment variables `REGION`, `PROJECT_ID`, and `PUBLIC_REPOSITORY` are still set; they are substituted into the inference kustomization below.

```bash
envsubst < fine-tuning/inference/kustomization.yaml | sponge fine-tuning/inference/kustomization.yaml
```
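
`sponge` is part of the `moreutils` package and may not be installed in your shell; an equivalent sketch that uses a temporary file instead:

```bash
tmp=$(mktemp)
envsubst < fine-tuning/inference/kustomization.yaml > "${tmp}" \
  && mv "${tmp}" fine-tuning/inference/kustomization.yaml
```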

```bash
kubectl apply -k fine-tuning/inference
```

13. Port Forwarding (for inference):

List the inference deployment pods:

```bash
kubectl get pods -l app=esm2-inference -n bionemo-training
```

Once the inference pod is in `Running` status, run:

```bash
kubectl port-forward -n bionemo-training svc/esm2-inference 8080:80
```
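
Before sending a prediction, you can check the `/health` route from a separate shell (it is served by `inference_server.py` and returns 503 until the model has finished loading):

```bash
curl http://localhost:8080/health
```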

In a separate shell window, send a prediction request:

```bash
curl -X POST http://localhost:8080/predict \
-H "Content-Type: application/json" \
-d '{"sequence": "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"}'
```

> **Reviewer comment (Contributor):** you could paste a sample output and give a brief description.
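
For reference, the response is a JSON object with `embeddings` (the mean-pooled last hidden state), `sequence_length`, and `input_length`, as returned by `inference_server.py`. A sketch for summarizing it, assuming `jq` is available:

```bash
curl -s -X POST http://localhost:8080/predict \
  -H "Content-Type: application/json" \
  -d '{"sequence": "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"}' \
  | jq '{sequence_length, input_length, embedding_dim: (.embeddings | length)}'
```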

## Cleanup

To delete the cluster and all associated resources:

```bash
kubectl delete namespace bionemo-training --cascade=background
```

```bash
gcloud container clusters delete "${CLUSTER_NAME}" --location="${ZONE}" --quiet
```

```bash
gcloud artifacts repositories delete ${PUBLIC_REPOSITORY} \
--location=${REGION} \
--quiet
```

```bash
gcloud iam service-accounts delete esm2-inference-gsa@${PROJECT_ID}.iam.gserviceaccount.com \
--quiet
```

```bash
docker rmi ${REGION}-docker.pkg.dev/${PROJECT_ID}/${PUBLIC_REPOSITORY}/esm2-inference:latest
```
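
Deleting the `bionemo-training` namespace removes the PVC, which should in turn remove the Filestore instance provisioned by the CSI driver; a hedged way to confirm that nothing billable is left behind:

```bash
gcloud filestore instances list --project=${PROJECT_ID}
```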
27 changes: 27 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/fine-tuning/inference/Dockerfile
@@ -0,0 +1,27 @@
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime

RUN groupadd -r model && useradd -r -g model model

RUN pip install --no-cache-dir \
torch \
transformers \
fastapi \
uvicorn \
numpy \
sentencepiece \
protobuf \
fair-esm

WORKDIR /app
COPY inference_server.py .

RUN chown -R model:model /app

USER model

EXPOSE 8000

ENV MODEL_PATH=/mnt/data/model
ENV CUDA_LAUNCH_BLOCKING=1

CMD ["uvicorn", "inference_server:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
77 changes: 77 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/fine-tuning/inference/deployment.yaml
@@ -0,0 +1,77 @@
apiVersion: apps/v1
kind: Deployment
metadata:
namespace: bionemo-training
name: esm2-inference
labels:
app: esm2-inference
spec:
replicas: 1
selector:
matchLabels:
app: esm2-inference
template:
metadata:
labels:
app: esm2-inference
spec:
serviceAccountName: esm2-inference-sa
containers:
- name: inference
image: esm2-inference-image
ports:
- containerPort: 8000
name: http
env:
- name: MODEL_PATH
value: "/mnt/data/model"
resources:
limits:
nvidia.com/gpu: 1
memory: "4Gi"
cpu: "2"
requests:
nvidia.com/gpu: 1
memory: "2Gi"
cpu: "500m"
volumeMounts:
- name: model-storage
mountPath: /mnt/data
readOnly: true
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 15
periodSeconds: 5
securityContext:
runAsNonRoot: true
runAsUser: 1000
allowPrivilegeEscalation: false
volumes:
- name: model-storage
persistentVolumeClaim:
claimName: bionemo-filestore
readOnly: true
nodeSelector:
cloud.google.com/gke-gpu: "true"
---
apiVersion: v1
kind: Service
metadata:
name: esm2-inference
spec:
selector:
app: esm2-inference
ports:
- port: 80
targetPort: 8000
protocol: TCP
name: http
type: ClusterIP
167 changes: 167 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/fine-tuning/inference/inference_server.py
@@ -0,0 +1,167 @@
# inference_server.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoModel, PreTrainedTokenizer, EsmConfig
import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

# Globals populated by the startup hook below; they stay None until the model is loaded
model = None
tokenizer = None
config = None

class CustomEsmTokenizer(PreTrainedTokenizer):
def __init__(self, **kwargs):
# Initialize vocabulary first
self.vocab = [
"<pad>", "<mask>", "<cls>", "<sep>", "<unk>",
"L", "A", "G", "V", "S", "E", "R", "T", "I", "D",
"P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C",
"X", "B", "U", "Z", "O", ".", "-", "*"
]
self.ids_to_tokens = dict(enumerate(self.vocab))
self.tokens_to_ids = {tok: i for i, tok in enumerate(self.vocab)}

# Set special token attributes
kwargs["pad_token"] = "<pad>"
kwargs["mask_token"] = "<mask>"
kwargs["cls_token"] = "<cls>"
kwargs["sep_token"] = "<sep>"
kwargs["unk_token"] = "<unk>"

# Now call parent constructor
super().__init__(**kwargs)

def get_vocab(self):
return self.tokens_to_ids.copy()

def _tokenize(self, text):
return list(text.strip().upper())

def _convert_token_to_id(self, token):
return self.tokens_to_ids.get(token, self.tokens_to_ids["<unk>"])

def _convert_id_to_token(self, index):
return self.ids_to_tokens.get(index, "<unk>")

def convert_tokens_to_string(self, tokens):
return "".join(tokens)

@property
def vocab_size(self):
return len(self.vocab)

def save_vocabulary(self, save_directory):
vocab_file = os.path.join(save_directory, "vocab.txt")
with open(vocab_file, "w") as f:
f.write("\n".join(self.vocab))
return (vocab_file,)

class InferenceRequest(BaseModel):
sequence: str

@app.on_event("startup")
async def load_model():
global model, tokenizer, config
try:
model_path = os.getenv("MODEL_PATH", "/mnt/data/model")
logger.info(f"Loading model from {model_path}")

# Load config
config = EsmConfig.from_pretrained(model_path)
logger.info(f"Model config loaded: vocab_size={config.vocab_size}")

# Create custom tokenizer
tokenizer = CustomEsmTokenizer()
logger.info(f"Created custom tokenizer with vocab size: {tokenizer.vocab_size}")

# Load model
model = AutoModel.from_pretrained(model_path)
model = model.eval()

if torch.cuda.is_available():
model = model.cuda()
logger.info("Model loaded on GPU")

# Test tokenization
test_seq = "MKTV"
test_tokens = tokenizer(
test_seq,
return_tensors="pt",
padding=True,
truncation=True
)
logger.info(f"Test tokenization shape: {test_tokens['input_ids'].shape}")
logger.info(f"Test token values: {test_tokens['input_ids'].tolist()}")

logger.info("Model and tokenizer loaded successfully")

except Exception as e:
logger.error(f"Error loading model: {str(e)}")
if os.path.exists(model_path):
logger.error(f"Directory contents: {os.listdir(model_path)}")
raise RuntimeError(f"Failed to load model: {str(e)}")

@app.post("/predict")
async def predict(request: InferenceRequest):
try:
# Validate input
if not request.sequence or len(request.sequence.strip()) == 0:
raise HTTPException(status_code=400, detail="Empty sequence provided")

logger.info(f"Processing sequence of length: {len(request.sequence)}")

# Tokenize
inputs = tokenizer(
request.sequence,
return_tensors="pt",
padding=True,
truncation=True,
max_length=1024
)

# Remove token_type_ids as ESM doesn't use them
if 'token_type_ids' in inputs:
del inputs['token_type_ids']

logger.info(f"Tokenized shape: {inputs['input_ids'].shape}")

# Move to GPU if available
if torch.cuda.is_available():
inputs = {k: v.cuda() for k, v in inputs.items()}

# Perform inference
with torch.no_grad():
outputs = model(**inputs)

# Get embeddings from last hidden state
embeddings = outputs.last_hidden_state.mean(dim=1)

return {
"embeddings": embeddings.cpu().numpy().tolist()[0],
"sequence_length": len(request.sequence),
"input_length": inputs['input_ids'].shape[1]
}

except Exception as e:
logger.error(f"Inference error: {str(e)}")
logger.error("Detailed traceback:", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Inference failed: {str(e)}"
)

@app.get("/health")
async def health_check():
"""Health check endpoint."""
if model is None or tokenizer is None:
raise HTTPException(status_code=503, detail="Model not loaded")
return {"status": "healthy"}
9 changes: 9 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/fine-tuning/inference/kustomization.yaml
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: bionemo-training
resources:
- deployment.yaml
images:
- name: esm2-inference-image
newName: ${REGION}-docker.pkg.dev/${PROJECT_ID}/${PUBLIC_REPOSITORY}/esm2-inference
newTag: latest
191 changes: 191 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/fine-tuning/job/finetuning.py
@@ -0,0 +1,191 @@
#!/usr/bin/env python3
import os
import pandas as pd
import subprocess
import torch
import re
import warnings
import shutil
import json

from bionemo.core.data.load import load

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

print("Setting WorkDir ...")
work_dir = "/workspace/bionemo2/esm2_finetune_tutorial"

if not os.path.exists(work_dir):
os.makedirs(work_dir)
print(f"Work dir: '{work_dir}' created.")

print("Downloading pre-trained model checkpoints ...")

checkpoint_path = load("esm2/650m:2.0")
print(checkpoint_path)

print("Entering regression ...")

def save_model_for_inference(checkpoint_dir, save_path):
"""Save the model in a format compatible with Hugging Face Transformers."""
os.makedirs(save_path, exist_ok=True)

print(f"Loading checkpoint from directory: {checkpoint_dir}")

try:
# Check the weights directory
weights_dir = os.path.join(checkpoint_dir, "weights")
if not os.path.exists(weights_dir):
raise FileNotFoundError(f"Weights directory not found in {checkpoint_dir}")

print(f"Contents of weights directory:")
for file in os.listdir(weights_dir):
print(f"- {file}")

# Load weights from the weights directory
weight_files = [f for f in os.listdir(weights_dir) if f.endswith('.pt')]
if not weight_files:
raise FileNotFoundError(f"No weight files found in {weights_dir}")

model_file = os.path.join(weights_dir, weight_files[0])
print(f"Loading model weights from: {model_file}")

checkpoint = torch.load(model_file)
print("Checkpoint loaded successfully")

# Save the model weights
if isinstance(checkpoint, dict):
if 'state_dict' in checkpoint:
state_dict = checkpoint['state_dict']
else:
state_dict = checkpoint
else:
state_dict = checkpoint

torch.save(state_dict, os.path.join(save_path, "pytorch_model.bin"))
print("Model weights saved successfully")

# Save the ESM vocabulary file
vocab_file = os.path.join(save_path, "vocab.txt")
vocab = [
"<pad>", "<mask>", "<cls>", "<sep>", "<unk>",
"L", "A", "G", "V", "S", "E", "R", "T", "I", "D",
"P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C",
"X", "B", "U", "Z", "O", ".", "-", "*"
]
with open(vocab_file, "w") as f:
f.write("\n".join(vocab))
print("Vocabulary file saved successfully")

# Create and save the config
config = {
"model_type": "esm",
"architectures": ["ESMForSequenceClassification"],
"hidden_size": 1280,
"num_attention_heads": 20,
"num_hidden_layers": 33,
"vocab_size": 33,
"max_position_embeddings": 1024,
"pad_token_id": 1,
"eos_token_id": 2,
"hidden_act": "gelu",
"attention_probs_dropout_prob": 0.0,
"hidden_dropout_prob": 0.0,
"initializer_range": 0.02,
"layer_norm_eps": 1e-5,
"position_embedding_type": "absolute"
}

with open(os.path.join(save_path, "config.json"), "w") as f:
json.dump(config, f, indent=2)
print("Config saved successfully")

# Create tokenizer config with vocab file reference
tokenizer_config = {
"model_max_length": 1024,
"padding_side": "right",
"truncation_side": "right",
"vocab_file": "vocab.txt",
"do_lower_case": False,
"special_tokens_map_file": None
}

with open(os.path.join(save_path, "tokenizer_config.json"), "w") as f:
json.dump(tokenizer_config, f, indent=2)
print("Tokenizer config saved successfully")

except Exception as e:
print(f"Error during model saving: {str(e)}")
raise

def run_finetune_esm2():
"""Runs the finetune_esm2 command using subprocess."""
command = ["python", "-m", "bionemo.esm2.model.finetune.train"]

try:
result = subprocess.run(command, check=True, capture_output=True, text=True)
print("finetune_esm2 output:\n", result.stdout)
        match = re.search(r"checkpoint stored at (.*)", result.stdout)
        if match is None:
            print("Could not find a checkpoint path in finetune_esm2 output.")
            return None
        checkpoint_path = match.group(1).strip()
        print(f"Checkpoint path: {checkpoint_path}")
return checkpoint_path

except subprocess.CalledProcessError as e:
print(f"finetune_esm2 failed with return code: {e.returncode}")
print("stderr:\n", e.stderr)
return None

def run_infer_esm2(work_dir, checkpoint_path, results_path):
"""Runs the infer_esm2 command using subprocess."""
artificial_sequence_data = [
"TLILGWSDKLGSLLNQLAIANESLGGGTIAVMAERDKEDMELDIGKMEFDFKGTSVI",
"LYSGDHSTQGARFLRDLAENTGRAEYELLSLF",
"GRFNVWLGGNESKIRQVLKAVKEIGVSPTLFAVYEKN",
"DELTALGGLLHDIGKPVQRAGLYSGDHSTQGARFLRDLAENTGRAEYELLSLF",
"KLGSLLNQLAIANESLGGGTIAVMAERDKEDMELDIGKMEFDFKGTSVI",
"LFGAIGNAISAIHGQSAVEELVDAFVGGARISSAFPYSGDTYYLPKP",
"LGGLLHDIGKPVQRAGLYSGDHSTQGARFLRDLAENTGRAEYELLSLF",
"LYSGDHSTQGARFLRDLAENTGRAEYELLSLF",
"ISAIHGQSAVEELVDAFVGGARISSAFPYSGDTYYLPKP",
"SGSKASSDSQDANQCCTSCEDNAPATSYCVECSEPLCETCVEAHQRVKYTKDHTVRSTGPAKT",
]

df = pd.DataFrame(artificial_sequence_data, columns=["sequences"])
data_path = os.path.join(work_dir, "sequences.csv")
df.to_csv(data_path, index=False)

command = [
"infer_esm2",
"--checkpoint-path", checkpoint_path,
"--data-path", data_path,
"--results-path", results_path,
"--config-class", "ESM2FineTuneSeqConfig",
]

try:
result = subprocess.run(command, check=True, capture_output=True, text=True)
print("infer_esm2 output:\n", result.stdout)
return True
except subprocess.CalledProcessError as e:
print(f"infer_esm2 failed with return code: {e.returncode}")
print("stderr:\n", e.stderr)
return False

checkpoint_path = run_finetune_esm2()
if checkpoint_path is not None:
print("finetune_esm2 completed successfully.")
results_path = work_dir

print("Starting Inference ...")
if run_infer_esm2(work_dir, checkpoint_path, results_path):
print("Inference completed successfully.")

# Save model in the format expected by the inference server
inference_model_path = "/mnt/data/model"
save_model_for_inference(checkpoint_path, inference_model_path)
print(f"Model saved for inference at {inference_model_path}")

else:
print("Inference failed.")
else:
print("finetune_esm2 failed.")
33 changes: 33 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/fine-tuning/job/job.yaml
@@ -0,0 +1,33 @@
apiVersion: batch/v1
kind: Job
metadata:
name: esm2-finetuning
namespace: bionemo-training
spec:
backoffLimit: 3 # Number of retries before marking job as failed
template:
spec:
containers:
- name: finetuning
image: nvcr.io/nvidia/clara/bionemo-framework:2.3
command: ["python3"]
args: ["/app/finetuning.py"]
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: bionemo-storage
mountPath: /mnt/data
- name: scripts
mountPath: /app
volumes:
- name: bionemo-storage
persistentVolumeClaim:
claimName: bionemo-filestore
- name: scripts
configMap:
name: finetuning-script
defaultMode: 0755
restartPolicy: Never
nodeSelector:
cloud.google.com/gke-gpu: "true"
11 changes: 11 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/fine-tuning/job/kustomization.yaml
@@ -0,0 +1,11 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: bionemo-training
resources:
- ../../base/storage
- job.yaml

configMapGenerator:
- name: finetuning-script
files:
- finetuning.py
Binary file not shown.
124 changes: 124 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/pretraining/README.md
@@ -0,0 +1,124 @@
# Training ESM2 LLM on GKE using BioNeMo Framework 2.0

This sample walks through setting up a Google Cloud GKE environment to train ESM2 (Evolutionary Scale Modeling) using the NVIDIA BioNeMo Framework 2.0.

## Table of Contents

- [Prerequisites](#prerequisites)
- [Setup](#setup)
- [Cleanup](#cleanup)

## Prerequisites

- **GCloud SDK:** Ensure you have the Google Cloud SDK installed and configured.
- **Project:** A Google Cloud project with billing enabled.
- **Permissions:** Sufficient permissions to create GKE clusters and other related resources.

## Setup

1. Set Project:

```bash
gcloud config set project "your-project-id"
```

Replace "your-project-id" with your actual project ID.

2. Set Environment Variables:

```bash
export PROJECT_ID="your-project-id"
export REGION=us-central1
export ZONE=us-central1-a
export CLUSTER_NAME=bionemo-demo
export NODE_POOL_MACHINE_TYPE=a2-highgpu-2g
export CLUSTER_MACHINE_TYPE=e2-standard-4
export GPU_TYPE=nvidia-tesla-a100
export GPU_COUNT=2
```

Adjust the zone, machine type, accelerator type, count, and number of nodes as per your requirements. Refer to Google Cloud documentation for available options. Consider smaller machine types for development to manage costs.

3. Enable the Filestore API and create a GKE Cluster

```bash
gcloud services enable file.googleapis.com --project ${PROJECT_ID}
```

> **Reviewer suggestion (Contributor):** `gcloud services enable file.googleapis.com` (without the `--project` flag).

```bash
gcloud container clusters create ${CLUSTER_NAME} \
--project=${PROJECT_ID} \
--location=${ZONE} \
--addons=GcpFilestoreCsiDriver \
--machine-type=${CLUSTER_MACHINE_TYPE} \
--num-nodes=1
```

4. Create GPU Node Pool:

```bash
gcloud container node-pools create gpupool \
--project=${PROJECT_ID} \
--location=${ZONE} \
--cluster=${CLUSTER_NAME} \
--machine-type=${NODE_POOL_MACHINE_TYPE} \
--num-nodes=1 \
--accelerator type=${GPU_TYPE},count=${GPU_COUNT},gpu-driver-version=latest
```

This creates a node pool specifically for GPU workloads.

5. Get Cluster Credentials:

```bash
gcloud container clusters get-credentials "${CLUSTER_NAME}" \
--location="${ZONE}"
```

6. Create the namespace, training job, TensorBoard microservice, and Google Cloud Filestore-backed storage

Optionally set a short alias for `kubectl`:

```bash
alias k=kubectl
```

Make sure you are in this directory:

```bash
cd tutorials-and-examples/nvidia-bionemo/
```

Then apply the kustomization:

```bash
k apply -k pretraining/
```
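
The training job takes a few minutes to download the sanity dataset and start; a sketch for watching it (job and namespace names come from `pretraining/job.yaml`):

```bash
k get jobs,pods -n bionemo-training

# Stream the training logs
k logs -f job/bionemo-training -n bionemo-training
```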

7. Port Forwarding (for TensorBoard):

List the pods and ensure the TensorBoard pod is in `Running` status:

```bash
k get pods -n bionemo-training
```

```bash
k port-forward -n bionemo-training svc/tensorboard-service 8080:6006
```

8. View TensorBoard logs

On your local machine, browse to <http://localhost:8080> (the port forwarded in the previous step), open the Time Series view, and see the loss curves as shown below.

> Note: TensorBoard dashboards take some time to appear, as the BioNeMo job needs a few minutes to start. The full plots will show up once the job's pod reaches `Completed` status.
[<img src="./images/tensorboard-results.png" width="750"/>](HighLevelArch)

## Cleanup

To delete the cluster and all associated resources:

```bash
k delete -k pretraining/

gcloud container clusters delete "${CLUSTER_NAME}" --location="${ZONE}" --quiet
```
24 changes: 24 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/pretraining/configmaps/startup-script.yaml
@@ -0,0 +1,24 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: bionemo-startup-script
namespace: bionemo-training
data:
start.sh: |
#!/bin/bash
nvidia-smi
DATA_DIR=/mnt/data
download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source ngc
mv /root/.cache/bionemo/006911f92bbc0ded7ea302bbdbfab4c694b409e699c32fd49de1c527a99dba3e-2024_03_sanity.tar.gz.untar/2024_03_sanity $DATA_DIR
export PATH=$PATH:/usr/local/nvidia/lib64/
/sbin/ldconfig
python scripts/protein/esm2/esm2_pretrain.py \
--train-cluster-path ${DATA_DIR}/2024_03_sanity/train_clusters_sanity.parquet \
--train-database-path ${DATA_DIR}/2024_03_sanity/train_sanity.db \
--valid-cluster-path ${DATA_DIR}/2024_03_sanity/valid_clusters.parquet \
--valid-database-path ${DATA_DIR}/2024_03_sanity/validation.db \
$(cat /config/training-params)
21 changes: 21 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/pretraining/configmaps/training-config.yaml
@@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: bionemo-training-config
namespace: bionemo-training
data:
training-params: |
--precision=bf16-mixed
--num-gpus 1
--num-nodes 1
--num-steps 100
--val-check-interval 25
--max-seq-length 1024
--limit-val-batches 2
--micro-batch-size 2
--num-layers 33
--hidden-size 1280
--num-attention-head 20
--ffn-hidden-size 5120
--tensor-model-parallel-size 1
--create-tensorboard-logger
41 changes: 41 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/pretraining/job.yaml
@@ -0,0 +1,41 @@
apiVersion: batch/v1
kind: Job
metadata:
name: bionemo-training
namespace: bionemo-training
spec:
template:
spec:
containers:
- name: training
image: nvcr.io/nvidia/clara/bionemo-framework:2.1
command: ["/bin/bash"]
args: ["/scripts/start.sh"]
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: bionemo-storage
mountPath: /mnt/data
subPath: training-data
- name: bionemo-storage
mountPath: /workspace/bionemo2/results
subPath: tensorboard-logs
- name: config-volume
mountPath: /config
- name: script-volume
mountPath: /scripts
volumes:
- name: bionemo-storage
persistentVolumeClaim:
claimName: bionemo-filestore
- name: config-volume
configMap:
name: bionemo-training-config
- name: script-volume
configMap:
name: bionemo-startup-script
defaultMode: 0777
restartPolicy: Never
nodeSelector:
cloud.google.com/gke-gpu: "true"
8 changes: 8 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/pretraining/kustomization.yaml
@@ -0,0 +1,8 @@
# pretraining/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../base
- configmaps/training-config.yaml
- configmaps/startup-script.yaml
- job.yaml
2 changes: 2 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/requirements.txt
@@ -0,0 +1,2 @@
pytest
kubernetes
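
The manifest tests under `tests/` are plain pytest; a minimal sketch for running them, assuming the working directory is `tutorials-and-examples/nvidia-bionemo/` (the tests load manifests by relative path) and `KUBECONFIG` points at a valid kubeconfig (see `tests/conftest.py`):

```bash
pip install -r requirements.txt
KUBECONFIG="${HOME}/.kube/config" pytest tests/ -v
```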
@@ -0,0 +1,34 @@
import pytest
from utils.kubernetes_helpers import load_yaml_manifest

@pytest.fixture
def tensorboard_deployment():
return load_yaml_manifest('base/monitoring/tensorboard-deployment.yaml')

@pytest.fixture
def tensorboard_service():
return load_yaml_manifest('base/monitoring/tensorboard-service.yaml')

def test_tensorboard_basic_structure(tensorboard_deployment):
assert tensorboard_deployment['apiVersion'] == 'apps/v1'
assert tensorboard_deployment['kind'] == 'Deployment'
assert tensorboard_deployment['metadata']['name'] == 'tensorboard'
assert tensorboard_deployment['metadata']['namespace'] == 'bionemo-training'

def test_tensorboard_deployment_spec(tensorboard_deployment):
spec = tensorboard_deployment['spec']['template']['spec']
assert spec['serviceAccountName'] == 'tensorboard-sa'

container = spec['containers'][0]
assert container['image'] == 'tensorflow/tensorflow:latest'
assert container['command'] == ['tensorboard']
assert '--port=6006' in container['args']
assert container['volumeMounts'][0]['name'] == 'bionemo-storage'
assert container['volumeMounts'][0]['mountPath'] == '/workspace/bionemo2/results'

def test_tensorboard_service_spec(tensorboard_service):
spec = tensorboard_service['spec']
assert spec['selector']['app'] == 'tensorboard'
assert spec['ports'][0]['port'] == 6006
assert spec['ports'][0]['targetPort'] == 6006
assert spec['type'] == 'ClusterIP'
@@ -0,0 +1,26 @@
import pytest
from utils.kubernetes_helpers import load_yaml_manifest

@pytest.fixture
def storage_class_manifest():
return load_yaml_manifest('base/storage/storage-class.yaml')

@pytest.fixture
def pvc_manifest():
return load_yaml_manifest('base/storage/pvcs.yaml')

def test_filestore_storage_class(storage_class_manifest):
filestore = storage_class_manifest

assert filestore['provisioner'] == 'filestore.csi.storage.gke.io'
assert filestore['parameters']['tier'] == 'BASIC_HDD'
assert filestore['volumeBindingMode'] == 'Immediate'
assert filestore['allowVolumeExpansion'] is True

def test_bionemo_pvc(pvc_manifest):
pvc = pvc_manifest

assert pvc['metadata']['namespace'] == 'bionemo-training'
assert pvc['spec']['accessModes'] == ['ReadWriteMany']
assert pvc['spec']['storageClassName'] == 'filestore-storage'
assert pvc['spec']['resources']['requests']['storage'] == '1Ti'
10 changes: 10 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/tests/conftest.py
@@ -0,0 +1,10 @@
import pytest
import os
from kubernetes import config

@pytest.fixture(scope="session")
def kube_config():
if os.getenv('KUBECONFIG'):
config.load_kube_config()
else:
config.load_incluster_config()
@@ -0,0 +1,30 @@
apiVersion: batch/v1
kind: Job
metadata:
name: bionemo-e2e-test
spec:
template:
spec:
containers:
- name: e2e-test
image: nvcr.io/nvidia/clara/bionemo-framework:2.1
command: ["/bin/bash", "-c"]
args:
- |
# Run minimal training job
python scripts/protein/esm2/esm2_pretrain.py \
--num-steps 10 \
--val-check-interval 5 \
--micro-batch-size 1 \
--num-layers 2 \
# Minimal configuration for testing
# Check for training artifacts
if [ ! -d "/workspace/bionemo2/results/lightning_logs" ]; then
echo "Training artifacts not found"
exit 1
fi
resources:
limits:
nvidia.com/gpu: 1
restartPolicy: Never
@@ -0,0 +1,30 @@
apiVersion: batch/v1
kind: Job
metadata:
name: bionemo-integration-test
spec:
template:
spec:
containers:
- name: test-runner
image: nvcr.io/nvidia/clara/bionemo-framework:2.1
command: ["/bin/bash", "-c"]
args:
- |
# Test data download
download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source ngc
if [ $? -ne 0 ]; then exit 1; fi
# Test GPU availability
nvidia-smi
if [ $? -ne 0 ]; then exit 1; fi
# Test volume mount
if [ ! -d "/mnt/data" ]; then exit 1; fi
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: fileserver
mountPath: /mnt/data
restartPolicy: Never
21 changes: 21 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/tests/pretraining/test_configmaps.py
@@ -0,0 +1,21 @@
# tests/pretraining/test_configmaps.py
import pytest
from utils.kubernetes_helpers import load_yaml_manifest

@pytest.fixture
def training_config():
return load_yaml_manifest('pretraining/configmaps/training-config.yaml')

@pytest.fixture
def startup_script():
return load_yaml_manifest('pretraining/configmaps/startup-script.yaml')

def test_training_params_content(training_config):
params = training_config['data']['training-params']
assert '--precision=bf16-mixed' in params
assert '--num-gpus 1' in params
assert '--num-nodes 1' in params

def test_startup_script_permissions(startup_script):
assert startup_script['metadata']['name'] == 'bionemo-startup-script'
assert 'start.sh' in startup_script['data']
@@ -0,0 +1,44 @@
import pytest
from utils.kubernetes_helpers import load_yaml_manifest

@pytest.fixture
def job_manifest():
return load_yaml_manifest('pretraining/job.yaml')

def test_job_basic_structure(job_manifest):
assert job_manifest['apiVersion'] == 'batch/v1'
assert job_manifest['kind'] == 'Job'
assert job_manifest['metadata']['name'] == 'bionemo-training'
assert job_manifest['metadata']['namespace'] == 'bionemo-training'

def test_container_resources(job_manifest):
container = job_manifest['spec']['template']['spec']['containers'][0]
assert container['resources']['limits']['nvidia.com/gpu'] == 1

def test_volume_mounts(job_manifest):
container = job_manifest['spec']['template']['spec']['containers'][0]
volume_mounts = {mount['name']: mount for mount in container['volumeMounts']}

# Check bionemo-storage mounts
assert 'bionemo-storage' in volume_mounts
training_mount = next(
mount for mount in container['volumeMounts']
if mount['mountPath'] == '/mnt/data'
)
logs_mount = next(
mount for mount in container['volumeMounts']
if mount['mountPath'] == '/workspace/bionemo2/results'
)

assert training_mount['subPath'] == 'training-data'
assert logs_mount['subPath'] == 'tensorboard-logs'

# Check other required mounts
assert 'config-volume' in volume_mounts
assert 'script-volume' in volume_mounts

def test_volumes_configuration(job_manifest):
volumes = {vol['name']: vol for vol in job_manifest['spec']['template']['spec']['volumes']}

assert 'bionemo-storage' in volumes
assert volumes['bionemo-storage']['persistentVolumeClaim']['claimName'] == 'bionemo-filestore'
15 changes: 15 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/tests/utils/kubernetes_helpers.py
@@ -0,0 +1,15 @@
import yaml
from kubernetes import client, config
from kubernetes.client.rest import ApiException

def load_yaml_manifest(file_path):
with open(file_path, 'r') as f:
documents = list(yaml.safe_load_all(f))
return documents[0] if len(documents) == 1 else documents

def validate_k8s_object(manifest):
try:
client.ApiClient().sanitize_for_serialization(manifest)
return True
except ApiException as e:
return False
159 changes: 159 additions & 0 deletions tutorials-and-examples/nvidia-bionemo/tests/utils/yaml_validators.py
@@ -0,0 +1,159 @@
# tests/utils/yaml_validators.py
import yaml
from typing import Dict, Any, List
from kubernetes import client
from kubernetes.client.rest import ApiException

class YAMLValidationError(Exception):
"""Custom exception for YAML validation errors"""
pass

def validate_yaml_syntax(content: str) -> bool:
"""
Validates basic YAML syntax
Args:
content: YAML content as string
Returns:
bool: True if valid, raises YAMLValidationError if invalid
"""
try:
yaml.safe_load(content)
return True
except yaml.YAMLError as e:
raise YAMLValidationError(f"Invalid YAML syntax: {str(e)}")

def validate_k8s_resource_requirements(container: Dict[str, Any]) -> bool:
"""
Validates Kubernetes container resource requirements
Args:
container: Container spec dictionary
Returns:
bool: True if valid, raises YAMLValidationError if invalid
"""
if 'resources' not in container:
raise YAMLValidationError("Container must specify resource requirements")

resources = container['resources']
if 'limits' not in resources or 'requests' not in resources:
raise YAMLValidationError("Container must specify both resource limits and requests")

return True

def validate_volume_mounts(container: Dict[str, Any], volumes: List[Dict[str, Any]]) -> bool:
"""
Validates that all volume mounts in a container have corresponding volumes
Args:
container: Container spec dictionary
volumes: List of volume specifications
Returns:
bool: True if valid, raises YAMLValidationError if invalid
"""
if 'volumeMounts' not in container:
return True

volume_names = {vol['name'] for vol in volumes}
mount_names = {mount['name'] for mount in container['volumeMounts']}

missing_volumes = mount_names - volume_names
if missing_volumes:
raise YAMLValidationError(
f"Volume mounts reference non-existent volumes: {missing_volumes}"
)

return True

def validate_configmap_data(configmap: Dict[str, Any]) -> bool:
"""
Validates ConfigMap data structure
Args:
configmap: ConfigMap manifest dictionary
Returns:
bool: True if valid, raises YAMLValidationError if invalid
"""
if 'data' not in configmap:
raise YAMLValidationError("ConfigMap must contain 'data' field")

if not isinstance(configmap['data'], dict):
raise YAMLValidationError("ConfigMap 'data' must be a key-value mapping")

return True

def validate_pvc_spec(pvc: Dict[str, Any]) -> bool:
"""
Validates PersistentVolumeClaim specification
Args:
pvc: PVC manifest dictionary
Returns:
bool: True if valid, raises YAMLValidationError if invalid
"""
required_fields = ['accessModes', 'resources']
for field in required_fields:
if field not in pvc['spec']:
raise YAMLValidationError(f"PVC spec must contain '{field}'")

if 'requests' not in pvc['spec']['resources']:
raise YAMLValidationError("PVC resources must specify 'requests'")

if 'storage' not in pvc['spec']['resources']['requests']:
raise YAMLValidationError("PVC must specify storage request")

return True

def validate_metadata(resource: Dict[str, Any], required_labels: List[str] = None) -> bool:
"""
Validates Kubernetes resource metadata
Args:
resource: Kubernetes resource dictionary
required_labels: List of required label keys
Returns:
bool: True if valid, raises YAMLValidationError if invalid
"""
if 'metadata' not in resource:
raise YAMLValidationError("Resource must contain metadata")

metadata = resource['metadata']
if 'name' not in metadata:
raise YAMLValidationError("Resource must have a name")

if required_labels:
if 'labels' not in metadata:
raise YAMLValidationError("Resource must have labels")

missing_labels = set(required_labels) - set(metadata['labels'].keys())
if missing_labels:
raise YAMLValidationError(f"Missing required labels: {missing_labels}")

return True

def validate_k8s_api_version(resource: Dict[str, Any]) -> bool:
"""
Validates that the API version is valid for the resource kind
Args:
resource: Kubernetes resource dictionary
Returns:
bool: True if valid, raises YAMLValidationError if invalid
"""
required_fields = ['apiVersion', 'kind']
for field in required_fields:
if field not in resource:
raise YAMLValidationError(f"Resource must specify '{field}'")

try:
client.ApiClient().sanitize_for_serialization(resource)
return True
except ApiException as e:
raise YAMLValidationError(f"Invalid resource definition: {str(e)}")