Skip to content

Commit

Permalink
Gracefully handle panics and invalid fields
Browse files Browse the repository at this point in the history
The purpose of this PR is to handle panics and errors gracefully in DCGM Exporter. The fix includes the following modifications:

1. Added recover functions to properly handle panic situations.
2. Fixed an issue where an invalid field would result in a panic; it now returns an error that shows up in the logs.
3. Amended .gitignore file.
4. Added unit tests to verify the above-mentioned fixes.
  • Loading branch information
rohit-arora-dev authored Feb 13, 2024
1 parent 3250bfe commit 6f93e31
Show file tree
Hide file tree
Showing 8 changed files with 358 additions and 49 deletions.
226 changes: 225 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,229 @@
dcgm-exporter
!etc/
!deployment/
.env
*.pem
*.csr
vendor/

###############################################################################
# JetBrains
# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
###############################################################################
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

###############################################################################
# Visual Studio Code
# https://github.com/github/gitignore/blob/master/Global/VisualStudioCode.gitignore
###############################################################################
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

###############################################################################
# Sublime Text
# https://github.com/github/gitignore/blob/master/Global/SublimeText.gitignore
###############################################################################

# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

###############################################################################
# Vim
# https://github.com/github/gitignore/blob/master/Global/Vim.gitignore
###############################################################################

# Swap
[._]*.s[a-v][a-z]
!*.svg # comment out if you don't need vector files
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]

# Session
Session.vim
Sessionx.vim

# Temporary
.netrwhist
*~
# Auto-generated tag files
tags
.env
# Persistent undo
[._]*.un~

###############################################################################
# Linux
# https://github.com/github/gitignore/blob/master/Global/Linux.gitignore
###############################################################################
*~

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*

###############################################################################
# OS X
# https://github.com/github/gitignore/blob/main/Global/macOS.gitignore
###############################################################################
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

###############################################################################
# Windows
# https://github.com/github/gitignore/blob/master/Global/Windows.gitignore
###############################################################################
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk
21 changes: 0 additions & 21 deletions .vscode/launch.json

This file was deleted.

4 changes: 0 additions & 4 deletions .vscode/settings.json

This file was deleted.

17 changes: 14 additions & 3 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,19 @@ import (
"os"
"os/signal"
"runtime"
"runtime/debug"
"strconv"
"strings"
"sync"
"syscall"
"text/template"
"time"

"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"

"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
)

const (
Expand Down Expand Up @@ -226,17 +228,26 @@ func newOSWatcher(sigs ...os.Signal) chan os.Signal {
return sigChan
}

func action(c *cli.Context) error {
func action(c *cli.Context) (err error) {
restart:

// The purpose of this function is to capture any panic that may occur
// during initialization and return an error.
defer func() {
if r := recover(); r != nil {
logrus.WithField(dcgmexporter.LoggerStackTrace, string(debug.Stack())).Error("Encountered a failure.")
err = fmt.Errorf("Encountered a failure: %v", r)
}
}()

logrus.Info("Starting dcgm-exporter")
config, err := contextToConfig(c)
if err != nil {
return err
}

if config.Debug {
//enable debug logging
// enable debug logging
logrus.SetLevel(logrus.DebugLevel)
logrus.Debug("Debug output is enabled")
}
Expand Down
23 changes: 13 additions & 10 deletions pkg/dcgmexporter/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,32 +25,35 @@ const (
DCGMXIDErrorsCount DCGMExporterMetric = iota + 9000
)

// DCGMFields maps the string name of each DCGMExporterMetric, as produced by
// its String method, to the corresponding enum value.
var DCGMFields = map[string]DCGMExporterMetric{
DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount,
DCGMFIUnknown.String(): DCGMFIUnknown,
}

// String method to convert the enum value to a string
func (enm DCGMExporterMetric) String() string {
switch enm {
func (d DCGMExporterMetric) String() string {
switch d {
case DCGMXIDErrorsCount:
return "DCGM_EXP_XID_ERRORS_COUNT"
default:
return "DCGM_FI_UNKNOWN"
}
}

func mustParseDCGMExporterMetric(s string) DCGMExporterMetric {
metrics := map[string]DCGMExporterMetric{
DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount,
DCGMFIUnknown.String(): DCGMFIUnknown,
}
mv, ok := metrics[s]
func IdentifyMetricType(s string) (DCGMExporterMetric, error) {
mv, ok := DCGMFields[s]
if !ok {
panic(fmt.Sprintf(`cannot parse:[%s] as DCGMExporterMetric`, s))
return mv, fmt.Errorf("Unknown DCGMExporterMetric field '%s'", s)
}
return mv
return mv, nil
}

// Constants for logging fields
const (
// LoggerGroupIDKey is the log field name "groupID".
LoggerGroupIDKey = "groupID"
// LoggerDumpKey is the log field name "dump".
LoggerDumpKey = "dump"
// LoggerStackTrace is the log field name "stacktrace"; the captured stack
// trace is attached under this field when a panic is recovered.
LoggerStackTrace = "stacktrace"
)

const (
Expand Down
Loading

0 comments on commit 6f93e31

Please sign in to comment.