Skip to content

Commit 1aaef05

Browse files
Jenkins Pipeline updates for Canceling Jobs (NOAA-EMC#2307)
Tuning updates for the Jenkins Pipeline: - Added a short circuit that stops all parallel case runs as soon as any one of them errors - Fixed canceling of all scheduled jobs on the first case error - Added a feature that saves error log files to the Jenkins Archive facility on failure
1 parent 6404892 commit 1aaef05

File tree

4 files changed

+59
-39
lines changed

4 files changed

+59
-39
lines changed

Jenkinsfile

+40-31
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ pipeline {
99

1010
options {
1111
skipDefaultCheckout()
12-
buildDiscarder(logRotator(numToKeepStr: '2'))
12+
parallelsAlwaysFailFast()
1313
}
1414

1515
stages { // This initial stage is used to get the Machine name from the GitHub labels on the PR
@@ -45,14 +45,14 @@ pipeline {
4545
properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hera-EMC', 'Orion-EMC'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])])
4646
HOME = "${WORKSPACE}/TESTDIR"
4747
commonworkspace = "${WORKSPACE}"
48-
sh(script: "mkdir -p ${HOME}/RUNTESTS", returnStatus: true)
48+
sh(script: "mkdir -p ${HOME}/RUNTESTS")
4949
pullRequest.addLabel("CI-${Machine}-Building")
5050
if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Ready") }) {
5151
pullRequest.removeLabel("CI-${Machine}-Ready")
52+
}
5253
}
5354
}
5455
}
55-
}
5656

5757
stage('Build System') {
5858
matrix {
@@ -71,35 +71,41 @@ pipeline {
7171
steps {
7272
script {
7373
def HOMEgfs = "${HOME}/${system}" // local HOMEgfs is used to build the system on per system basis under the common workspace HOME
74-
sh(script: "mkdir -p ${HOMEgfs}", returnStatus: true)
74+
sh(script: "mkdir -p ${HOMEgfs}")
7575
ws(HOMEgfs) {
7676
env.MACHINE_ID = machine // MACHINE_ID is used in the build scripts to determine the machine and is added to the shell environment
7777
if (fileExists("${HOMEgfs}/sorc/BUILT_semaphor")) { // if the system is already built, skip the build in the case of re-runs
7878
sh(script: "cat ${HOMEgfs}/sorc/BUILT_semaphor", returnStdout: true).trim() // TODO: add user-configurable control to manage the build semaphore
79-
ws(commonworkspace) { pullRequest.comment("Cloned PR already built (or build skipped) on ${machine} in directory ${HOMEgfs}") }
79+
pullRequest.comment("Cloned PR already built (or build skipped) on ${machine} in directory ${HOMEgfs}<br>Still doing a checkout to get the latest changes")
80+
sh(script: 'source workflow/gw_setup.sh; git pull --recurse-submodules')
81+
dir('sorc') {
82+
sh(script: './link_workflow.sh')
83+
}
8084
} else {
8185
checkout scm
82-
sh(script: 'source workflow/gw_setup.sh;which git;git --version;git submodule update --init --recursive', returnStatus: true)
86+
sh(script: 'source workflow/gw_setup.sh;which git;git --version;git submodule update --init --recursive')
8387
def builds_file = readYaml file: 'ci/cases/yamls/build.yaml'
8488
def build_args_list = builds_file['builds']
8589
def build_args = build_args_list[system].join(' ').trim().replaceAll('null', '')
8690
dir("${HOMEgfs}/sorc") {
87-
sh(script: "${build_args}", returnStatus: true)
88-
sh(script: './link_workflow.sh', returnStatus: true)
89-
sh(script: "echo ${HOMEgfs} > BUILT_semaphor", returnStatus: true)
91+
sh(script: "${build_args}")
92+
sh(script: './link_workflow.sh')
93+
sh(script: "echo ${HOMEgfs} > BUILT_semaphor")
9094
}
9195
}
92-
if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Building") }) {
93-
pullRequest.removeLabel("CI-${Machine}-Building")
94-
}
95-
pullRequest.addLabel("CI-${Machine}-Running")
96-
}
96+
if (env.CHANGE_ID && system == 'gfs') {
97+
if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Building") }) {
98+
pullRequest.removeLabel("CI-${Machine}-Building")
99+
}
100+
pullRequest.addLabel("CI-${Machine}-Running")
101+
}
102+
}
103+
}
97104
}
98105
}
99106
}
100107
}
101108
}
102-
}
103109

104110
stage('Run Tests') {
105111
matrix {
@@ -108,19 +114,19 @@ pipeline {
108114
axis {
109115
name 'Case'
110116
// TODO add dynamic list of cases from env vars (needs additional plugins)
111-
values 'C48_ATM', 'C48_S2SWA_gefs', 'C48_S2SW', 'C96_atm3DVar', 'C48mx500_3DVarAOWCDA', 'C96C48_hybatmDA', 'C96_atmsnowDA'
117+
values 'C48_ATM', 'C48_S2SWA_gefs', 'C48_S2SW', 'C96_atm3DVar', 'C96C48_hybatmDA', 'C96_atmsnowDA' // 'C48mx500_3DVarAOWCDA'
112118
}
113119
}
114120
stages {
115121
stage('Create Experiment') {
116122
steps {
117123
script {
118-
sh(script: "sed -n '/{.*}/!p' ${HOME}/gfs/ci/cases/pr/${Case}.yaml > ${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp", returnStatus: true)
124+
sh(script: "sed -n '/{.*}/!p' ${HOME}/gfs/ci/cases/pr/${Case}.yaml > ${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp")
119125
def yaml_case = readYaml file: "${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp"
120126
system = yaml_case.experiment.system
121127
def HOMEgfs = "${HOME}/${system}" // local HOMEgfs is used to populate the XML on per system basis
122128
env.RUNTESTS = "${HOME}/RUNTESTS"
123-
sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml", returnStatus: true)
129+
sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml")
124130
}
125131
}
126132
}
@@ -130,16 +136,27 @@ pipeline {
130136
HOMEgfs = "${HOME}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments
131137
ws(HOMEgfs) {
132138
pslot = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh get_pslot ${HOME}/RUNTESTS ${Case}", returnStdout: true).trim()
133-
pullRequest.comment("**Running** experiment: ${Case} on ${Machine}<br>With the experiment in directory:<br>`${HOME}/RUNTESTS/${pslot}`")
134-
try {
135-
sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot}", returnStatus: true)
136-
} catch (Exception e) {
139+
// pullRequest.comment("**Running** experiment: ${Case} on ${Machine}<br>With the experiment in directory:<br>`${HOME}/RUNTESTS/${pslot}`")
140+
err = sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot}", returnStatus: true) // returnStatus makes sh return the script's exit code; without it sh returns null (so err != 0 is always true) and throws on a non-zero exit before the failure handling below can run
141+
if (err != 0) {
137142
pullRequest.comment("**FAILURE** running experiment: ${Case} on ${Machine}")
143+
sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh cancel_all_batch_jobs ${HOME}/RUNTESTS")
144+
ws(HOME) {
145+
if (fileExists('RUNTESTS/error.logs')) {
146+
def fileContent = readFile 'RUNTESTS/error.logs'
147+
def lines = fileContent.readLines()
148+
for (line in lines) {
149+
echo "archiving: ${line}"
150+
archiveArtifacts artifacts: "${line}", fingerprint: true
151+
}
152+
}
153+
}
138154
error("Failed to run experiments ${Case} on ${Machine}")
139155
}
140-
pullRequest.comment("**SUCCESS** running experiment: ${Case} on ${Machine}")
156+
// pullRequest.comment("**SUCCESS** running experiment: ${Case} on ${Machine}")
141157
}
142158
}
159+
143160
}
144161
}
145162
}
@@ -175,14 +192,6 @@ pipeline {
175192
def timestamp = new Date().format('MM dd HH:mm:ss', TimeZone.getTimeZone('America/New_York'))
176193
pullRequest.comment("**CI FAILED** ${Machine} at ${timestamp}<br>Built and ran in directory `${HOME}`")
177194
}
178-
if (fileExists('${HOME}/RUNTESTS/ci.log')) {
179-
def fileContent = readFile '${HOME}/RUNTESTS/ci.log'
180-
fileContent.eachLine { line ->
181-
if (line.contains('.log')) {
182-
archiveArtifacts artifacts: "${line}", fingerprint: true
183-
}
184-
}
185-
}
186195
}
187196
}
188197
}

ci/cases/pr/C48mx500_3DVarAOWCDA.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ arguments:
1717
start: warm
1818
yaml: {{ HOMEgfs }}/ci/cases/yamls/soca_gfs_defaults_ci.yaml
1919

20-
skip_ci_on_hosts:
20+
skip_ci_on_host:
2121
- orion
22+
- hera
2223
- hercules

ci/scripts/run-check_ci.sh

+9-7
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pslot=${2:-${pslot:-?}} # Name of the experiment being tested by this scr
2525
# TODO: Make this configurable (for now all scripts run from gfs for CI at runtime)
2626
HOMEgfs="${TEST_DIR}/gfs"
2727
RUNTESTS="${TEST_DIR}/RUNTESTS"
28+
run_check_logfile="${RUNTESTS}/ci-run_check.log"
2829

2930
# Source modules and setup logging
3031
echo "Source modules."
@@ -77,15 +78,16 @@ while true; do
7778
{
7879
echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true
7980
echo "Experiment ${pslot} Terminated: *FAILED*"
80-
} >> "${RUNTESTS}/ci.log"
81-
81+
} | tee -a "${run_check_logfile}"
8282
error_logs=$(rocotostat -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs rocotocheck -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
8383
{
8484
echo "Error logs:"
8585
echo "${error_logs}"
86-
} >> "${RUNTESTS}/ci.log"
87-
sed -i "s/\`\`\`//2g" "${RUNTESTS}/ci.log"
88-
sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true
86+
} | tee -a "${run_check_logfile}"
87+
# rm -f "${RUNTESTS}/error.logs"
88+
for log in ${error_logs}; do
89+
echo "RUNTESTS${log#*RUNTESTS}" >> "${RUNTESTS}/error.logs"
90+
done
8991
rc=1
9092
break
9193
fi
@@ -95,8 +97,7 @@ while true; do
9597
echo "Experiment ${pslot} Completed at $(date)" || true
9698
echo "with ${num_succeeded} successfully completed jobs" || true
9799
echo "Experiment ${pslot} Completed: *SUCCESS*"
98-
} >> "${RUNTESTS}/ci.log"
99-
sed -i "s/\`\`\`//2g" "${RUNTESTS}/ci.log"
100+
} | tee -a "${run_check_logfile}"
100101
rc=0
101102
break
102103
fi
@@ -107,3 +108,4 @@ while true; do
107108
done
108109

109110
exit "${rc}"
111+

ci/scripts/utils/ci_utils.sh

+8
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,14 @@ function get_pslot () {
102102

103103
}
104104

105+
function cancel_all_batch_jobs () {
106+
local RUNTESTS="${1}"
107+
pslot_list=$(get_pslot_list "${RUNTESTS}")
108+
for pslot in ${pslot_list}; do
109+
cancel_batch_jobs "${pslot}"
110+
done
111+
}
112+
105113
function create_experiment () {
106114

107115
local yaml_config="${1}"

0 commit comments

Comments
 (0)