Skip to content

Commit

Permalink
Merge pull request #263 from NVIDIA/fix-error-when-multiple-exp-metri…
Browse files Browse the repository at this point in the history
…cs-enabled

Fixed defect: dcgm-exporter fails into a panic
  • Loading branch information
nvvfedorov authored Feb 23, 2024
2 parents ae83378 + e8ecd7b commit 03f3833
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 30 deletions.
113 changes: 105 additions & 8 deletions pkg/dcgmexporter/clock_events_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,14 +359,7 @@ func TestClockEventsCollector_Gather_AllTheThings(t *testing.T) {
// We get metric value with 0 index
metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]

for i := 0; i < len(metricValues); i++ {
gpuID, err := strconv.ParseUint(metricValues[i].GPU, 10, 64)
if err == nil {
if !slices.Contains(gpuIDs, uint(gpuID)) {
metricValues = append(metricValues[:i], metricValues[i+1:]...)
}
}
}
metricValues = getFakeGPUMetrics(metricValues, gpuIDs)

// Expected 9 metric values, because we injected 9 reasons
require.Len(t, metricValues, 9)
Expand All @@ -382,3 +375,107 @@ func TestClockEventsCollector_Gather_AllTheThings(t *testing.T) {
require.Equal(t, expectedVal, val.Value)
}
}

// TestClockEventsCollector_Gather_AllTheThings_WhenNoLabels verifies that the
// clock events collector still produces metrics when the only configured
// counter is DCGM_EXP_CLOCK_EVENTS_COUNT, i.e. when the counter list contains
// no DCGM counters and no label entries.
//
// The test creates a fake GPU entity, injects a throttle-reasons value with
// nine reason bits set, and expects nine metric values for that fake GPU.
func TestClockEventsCollector_Gather_AllTheThings_WhenNoLabels(t *testing.T) {
teardownTest := setupTest(t)
defer teardownTest(t)
// NOTE(review): presumably skips the test when no live GPU is available — confirm in the helper.
runOnlyWithLiveGPUs(t)

hostname := "local-test"
config := &Config{
GPUDevices: DeviceOptions{
Flex: true,
MajorRange: []int{-1},
MinorRange: []int{-1},
},
// NOTE(review): this evaluates to the number of nanoseconds in five minutes;
// confirm that is the unit ClockEventsCountWindowSize expects.
ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute),
}

// A single exporter counter and nothing else: no DCGM counters, no labels.
records := [][]string{
{"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""},
}

cc, err := extractCounters(records, config)
require.NoError(t, err)
require.Len(t, cc.ExporterCounters, 1)
require.Len(t, cc.DCGMCounters, 0)

// Create a fake GPU; skip when adding one more device would exceed the DCGM limit.
numGPUs, err := dcgm.GetAllDeviceCount()
require.NoError(t, err)

if numGPUs+1 > dcgm.MAX_NUM_DEVICES {
t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES)
}

entityList := []dcgm.MigHierarchyInfo{
{Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}},
}

gpuIDs, err := dcgm.CreateFakeEntities(entityList)
require.NoError(t, err)
require.NotEmpty(t, gpuIDs)

// Inject one throttle-reasons sample, timestamped one second in the past,
// with nine distinct reason bits OR-ed together.
gpuID := gpuIDs[0]
err = dcgm.InjectFieldValue(gpuID,
dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS,
dcgm.DCGM_FT_INT64,
0,
time.Now().Add(-time.Duration(1)*time.Second).UnixMicro(),
int64(DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE|
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING|
DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP|
DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN|
DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST|
DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL|
DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL|
DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE|
DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS),
)

require.NoError(t, err)

// The system info is loaded for the throttle-reasons field on GPU entities.
allCounters := []Counter{
{
FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS,
},
}

fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config)

err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU)
require.NoError(t, err)

// NOTE(review): the second return value of Get is discarded here; consider asserting on it.
item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU)

collector, err := NewClockEventsCollector(cc.ExporterCounters, hostname, config, item)
require.NoError(t, err)

defer func() {
collector.Cleanup()
}()

metrics, err := collector.GetMetrics()
require.NoError(t, err)
require.NotEmpty(t, metrics)
// We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT
require.Len(t, metrics, 1)
// The map holds exactly one entry, so take the values stored under its only key.
metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]
// Keep only values reported for the fake GPU so real GPUs do not affect the count.
metricValues = getFakeGPUMetrics(metricValues, gpuIDs)
// Expected 9 metric values, because we injected 9 reasons
require.Len(t, metricValues, 9)
}

// getFakeGPUMetrics filters metricValues down to the entries that belong to
// one of the GPUs listed in gpuIDs and returns the filtered slice.
//
// An entry is dropped only when its GPU field parses as a base-10 unsigned
// integer that is not present in gpuIDs. Entries whose GPU field cannot be
// parsed are kept.
//
// The filtering is done in place: the backing array of metricValues is reused,
// so callers must use the returned slice and not the one they passed in.
func getFakeGPUMetrics(metricValues []Metric, gpuIDs []uint) []Metric {
	// slices.DeleteFunc visits every element exactly once. Removing elements
	// by hand inside an index loop skips the element that shifts into the
	// freed slot, which leaves consecutive foreign GPUs in the result.
	return slices.DeleteFunc(metricValues, func(m Metric) bool {
		gpuID, err := strconv.ParseUint(m.GPU, 10, 64)
		return err == nil && !slices.Contains(gpuIDs, uint(gpuID))
	})
}
51 changes: 29 additions & 22 deletions pkg/dcgmexporter/expcollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"fmt"
"io"
"maps"
"slices"
"sync"
"sync/atomic"
"text/template"
Expand Down Expand Up @@ -132,29 +131,12 @@ func (c *expCollector) getMetrics() (MetricsByCounter, error) {
uuid = "uuid"
}
for _, mi := range monitoringInfo {
latestValues, err := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.labelDeviceFields)
if err != nil {
return nil, err
}
// Extract Labels
for _, val := range latestValues {
v := ToString(val)
// Filter out counters with no value and ignored fields for this entity
if v == SkipDCGMValue {
continue
}

counter, err := FindCounterField(c.labelsCounters, val.FieldId)
if len(c.labelsCounters) > 0 {
err := c.getLabelsFromCounters(mi, labels)
if err != nil {
continue
}

if counter.PromType == "label" {
labels[counter.FieldName] = v
continue
return nil, err
}
}

entityValues, exists := mapEntityIDToValues[mi.DeviceInfo.GPU]
if exists {
for entityValue, val := range entityValues {
Expand Down Expand Up @@ -200,6 +182,32 @@ func (c *expCollector) getMetrics() (MetricsByCounter, error) {
return metrics, nil
}

// getLabelsFromCounters reads the latest values of the collector's label
// device fields for the entity described by mi and stores them in labels.
//
// For every returned value that is not SkipDCGMValue and that maps to a
// counter in c.labelsCounters with PromType "label", labels[FieldName] is set
// to the value's string form. Values that cannot be matched to a counter are
// ignored. An error is returned only when the DCGM query itself fails.
func (c *expCollector) getLabelsFromCounters(mi MonitoringInfo, labels map[string]string) error {
	fieldValues, queryErr := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.labelDeviceFields)
	if queryErr != nil {
		return queryErr
	}

	for _, fieldValue := range fieldValues {
		text := ToString(fieldValue)
		// Skip fields with no value and fields ignored for this entity.
		if text == SkipDCGMValue {
			continue
		}

		labelCounter, lookupErr := FindCounterField(c.labelsCounters, fieldValue.FieldId)
		// Only fields known as label counters contribute to the label set.
		if lookupErr != nil || labelCounter.PromType != "label" {
			continue
		}

		labels[labelCounter.FieldName] = text
	}

	return nil
}

func (c *expCollector) Cleanup() {
for _, cleanup := range c.cleanups {
cleanup()
Expand All @@ -218,7 +226,6 @@ func newExpCollector(
for i := 0; i < len(counters); i++ {
if counters[i].PromType == "label" {
labelsCounters = append(labelsCounters, counters[i])
counters = slices.Delete(counters, i, i+1)
}
}

Expand Down

0 comments on commit 03f3833

Please sign in to comment.