From 6f93e31db96d1b7bf1708f76d0d821e905bbae38 Mon Sep 17 00:00:00 2001 From: Rohit Arora <49132604+rohit-arora-dev@users.noreply.github.com> Date: Tue, 13 Feb 2024 11:09:57 -0500 Subject: [PATCH] Gracefully handle panics and invalid fields The purpose of this PR is handle panics and errors gracefully in DCGM Exporter. The fix includes following modifications: 1. Added recover functions to properly handle panic situations. 2. Fixed an issue where invalid field would result in a panic, instead it returns an error that shows up in logs. 3. Amended .gitignore file. 4. Added Unit tests to verify above mentioned fixes. --- .gitignore | 226 +++++++++++++++++++++++++++++++- .vscode/launch.json | 21 --- .vscode/settings.json | 4 - pkg/cmd/app.go | 17 ++- pkg/dcgmexporter/const.go | 23 ++-- pkg/dcgmexporter/const_test.go | 63 +++++++++ pkg/dcgmexporter/parser.go | 13 +- pkg/dcgmexporter/parser_test.go | 40 +++++- 8 files changed, 358 insertions(+), 49 deletions(-) delete mode 100644 .vscode/launch.json delete mode 100644 .vscode/settings.json create mode 100644 pkg/dcgmexporter/const_test.go diff --git a/.gitignore b/.gitignore index 0e02e461..4dfcd525 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,229 @@ dcgm-exporter !etc/ !deployment/ +.env +*.pem +*.csr +vendor/ + +############################################################################### +# JetBrains +# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore +############################################################################### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +############################################################################### +# JetBrains +# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore +############################################################################### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +############################################################################### +# Sublime Text +# https://github.com/github/gitignore/blob/master/Global/SublimeText.gitignore +############################################################################### + +# cache files for sublime text +*.tmlanguage.cache +*.tmPreferences.cache +*.stTheme.cache + +# workspace files are user-specific +*.sublime-workspace + +# project files should be checked into the repository, unless a significant +# proportion of contributors will probably not be using SublimeText +# *.sublime-project + +# sftp configuration file +sftp-config.json + +############################################################################### +# Vim +# https://github.com/github/gitignore/blob/master/Global/Vim.gitignore +############################################################################### + +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files tags -.env \ No newline at end of file +# Persistent undo +[._]*.un~ + +############################################################################### +# Linux +# https://github.com/github/gitignore/blob/master/Global/Linux.gitignore +############################################################################### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +############################################################################### +# OS X +# https://github.com/github/gitignore/blob/main/Global/macOS.gitignore +############################################################################### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +############################################################################### +# Windows +# https://github.com/github/gitignore/blob/master/Global/Windows.gitignore +############################################################################### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index a4f0acbf..00000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Run Debug", - "type": "go", - "request": "launch", - "mode": "debug", - "cwd": "${workspaceRoot}", - "program": "cmd/dcgm-exporter/main.go", - "args": [ - "-f", - "./etc/default-counters.csv", - "--debug" - ] - } - ] -} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 890370b4..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "go.testTimeout": "60s", - "go.testFlags": ["-v","-count=1"] -} \ No newline at end of file diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index c90a585f..7aa77024 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -6,6 +6,7 @@ import ( "os" "os/signal" "runtime" + "runtime/debug" "strconv" "strings" "sync" @@ -13,10 +14,11 @@ import ( "text/template" "time" - "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" + + "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" ) const ( @@ -226,9 +228,18 @@ func newOSWatcher(sigs ...os.Signal) chan os.Signal { return sigChan } -func action(c *cli.Context) error { +func action(c *cli.Context) (err error) { restart: + // The purpose of this function is to capture any panic that may occur + // during initialization and return an error. + defer func() { + if r := recover(); r != nil { + logrus.WithField(dcgmexporter.LoggerStackTrace, string(debug.Stack())).Error("Encountered a failure.") + err = fmt.Errorf("Encountered a failure: %v", r) + } + }() + logrus.Info("Starting dcgm-exporter") config, err := contextToConfig(c) if err != nil { @@ -236,7 +247,7 @@ restart: } if config.Debug { - //enable debug logging + // enable debug logging logrus.SetLevel(logrus.DebugLevel) logrus.Debug("Debug output is enabled") } diff --git a/pkg/dcgmexporter/const.go b/pkg/dcgmexporter/const.go index 4286cea8..6036e32a 100644 --- a/pkg/dcgmexporter/const.go +++ b/pkg/dcgmexporter/const.go @@ -25,9 +25,15 @@ const ( DCGMXIDErrorsCount DCGMExporterMetric = iota + 9000 ) +// DCGMFields maps DCGMExporterMetric String to enum +var DCGMFields = map[string]DCGMExporterMetric{ + DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, + DCGMFIUnknown.String(): DCGMFIUnknown, +} + // String method to convert the enum value to a string -func (enm DCGMExporterMetric) String() string { - switch enm { +func (d DCGMExporterMetric) String() string { + switch d { case DCGMXIDErrorsCount: return "DCGM_EXP_XID_ERRORS_COUNT" default: @@ -35,22 +41,19 @@ func (enm DCGMExporterMetric) String() string { } } -func mustParseDCGMExporterMetric(s string) DCGMExporterMetric { - metrics := map[string]DCGMExporterMetric{ - DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, - DCGMFIUnknown.String(): DCGMFIUnknown, - } - mv, ok := metrics[s] +func IdentifyMetricType(s string) (DCGMExporterMetric, error) { + mv, ok := DCGMFields[s] if !ok { - panic(fmt.Sprintf(`cannot parse:[%s] as DCGMExporterMetric`, s)) + return mv, fmt.Errorf("Unknown DCGMExporterMetric field '%s'", s) } - return mv + return mv, nil } // Constants for logging fields const ( LoggerGroupIDKey = "groupID" LoggerDumpKey = "dump" + LoggerStackTrace = "stacktrace" ) const ( diff --git a/pkg/dcgmexporter/const_test.go b/pkg/dcgmexporter/const_test.go new file mode 100644 index 00000000..aaa9834f --- /dev/null +++ b/pkg/dcgmexporter/const_test.go @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIdentifyMetricType(t *testing.T) { + tests := []struct { + name string + field string + output DCGMExporterMetric + valid bool + }{ + { + name: "Valid Input DCGM_EXP_XID_ERRORS_COUNT", + field: "DCGM_EXP_XID_ERRORS_COUNT", + output: DCGMXIDErrorsCount, + valid: true, + }, + { + name: "Valid Input DCGM_FI_UNKNOWN", + field: "DCGM_FI_UNKNOWN", + output: DCGMFIUnknown, + valid: true, + }, + { + name: "Invalid Input DCGM_EXP_XID_ERRORS_COUNTXXX", + field: "DCGM_EXP_XID_ERRORS_COUNTXXX", + output: DCGMFIUnknown, + valid: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + output, err := IdentifyMetricType(tt.field) + if tt.valid { + assert.NoError(t, err, "Expected metrics to be found.") + assert.Equal(t, output, tt.output, "Invalid output") + } else { + assert.Errorf(t, err, "Expected metrics to be not found") + } + }) + } +} diff --git a/pkg/dcgmexporter/parser.go b/pkg/dcgmexporter/parser.go index 0c76bab9..17eadfc4 100644 --- a/pkg/dcgmexporter/parser.go +++ b/pkg/dcgmexporter/parser.go @@ -52,7 +52,7 @@ func ExtractCounters(c *Config) ([]Counter, []Counter, error) { logrus.Fatal(err) } } else { - err = fmt.Errorf("No configmap data specified") + err = fmt.Errorf("no configmap data specified") } if err != nil { @@ -103,19 +103,20 @@ func extractCounters(records [][]string, c *Config) ([]Counter, []Counter, error } if len(record) != 3 { - return nil, nil, fmt.Errorf("Malformed CSV record, failed to parse line %d (`%v`), expected 3 fields", i, record) + return nil, nil, fmt.Errorf("Malformed CSV record, failed to parse line %d (`%v`), expected 3 fields", i, + record) } fieldID, ok := dcgm.DCGM_FI[record[0]] oldFieldID, oldOk := dcgm.OLD_DCGM_FI[record[0]] if !ok && !oldOk { - expField := mustParseDCGMExporterMetric(record[0]) - if expField != DCGMFIUnknown { + expField, err := IdentifyMetricType(record[0]) + if err != nil { + return nil, nil, fmt.Errorf("could not find DCGM field; err: %v", err) + } else if expField != DCGMFIUnknown { expf = append(expf, Counter{dcgm.Short(expField), record[0], record[1], record[2]}) continue - } else { - return nil, nil, fmt.Errorf("Could not find DCGM field %s", record[0]) } } diff --git a/pkg/dcgmexporter/parser_test.go b/pkg/dcgmexporter/parser_test.go index eff8197c..ef70de55 100644 --- a/pkg/dcgmexporter/parser_test.go +++ b/pkg/dcgmexporter/parser_test.go @@ -4,6 +4,7 @@ import ( "os" "testing" + "github.com/stretchr/testify/assert" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" @@ -99,6 +100,32 @@ func TestInvalidConfigMapNamespace(t *testing.T) { } func TestExtractCounters(t *testing.T) { + tests := []struct { + name string + field string + valid bool + }{ + { + name: "Valid Input DCGM_FI_DEV_GPU_TEMP", + field: "DCGM_FI_DEV_GPU_TEMP, gauge, temperature\n", + valid: true, + }, + { + name: "Invalid Input DCGM_EXP_XID_ERRORS_COUNTXXX", + field: "DCGM_EXP_XID_ERRORS_COUNTXXX, gauge, temperature\n", + valid: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + extractCountersHelper(t, tt.field, tt.valid) + }) + } + +} + +func extractCountersHelper(t *testing.T, input string, valid bool) { tmpFile, err := os.CreateTemp(os.TempDir(), "prefix-") if err != nil { t.Fatalf("Cannot create temporary file: %v", err) @@ -106,7 +133,7 @@ func TestExtractCounters(t *testing.T) { defer os.Remove(tmpFile.Name()) - text := []byte("DCGM_FI_DEV_GPU_TEMP, gauge, temperature\n") + text := []byte(input) if _, err = tmpFile.Write(text); err != nil { t.Fatalf("Failed to write to temporary file: %v", err) } @@ -121,8 +148,13 @@ func TestExtractCounters(t *testing.T) { ConfigMapData: undefinedConfigMapData, CollectorsFile: tmpFile.Name(), } - records, _, err := ExtractCounters(&c) - if len(records) != 1 || err != nil { - t.Fatalf("Should have succeeded: records (%d != 1) err=%v", len(records), err) + records, extraCounters, err := ExtractCounters(&c) + if valid { + assert.NoError(t, err, "Expected no error.") + assert.Equal(t, 1, len(records), "Expected 1 record counters.") + } else { + assert.Error(t, err, "Expected error.") + assert.Equal(t, 0, len(records), "Expected no counters.") + assert.Equal(t, 0, len(extraCounters), "Expected no extra counters.") } }