diff --git a/pkg/dcgmexporter/clock_events_collector_test.go b/pkg/dcgmexporter/clock_events_collector_test.go
index 496fbcfb..cde826d3 100644
--- a/pkg/dcgmexporter/clock_events_collector_test.go
+++ b/pkg/dcgmexporter/clock_events_collector_test.go
@@ -359,14 +359,7 @@ func TestClockEventsCollector_Gather_AllTheThings(t *testing.T) {
 
 	// We get metric value with 0 index
 	metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]
-	for i := 0; i < len(metricValues); i++ {
-		gpuID, err := strconv.ParseUint(metricValues[i].GPU, 10, 64)
-		if err == nil {
-			if !slices.Contains(gpuIDs, uint(gpuID)) {
-				metricValues = append(metricValues[:i], metricValues[i+1:]...)
-			}
-		}
-	}
+	metricValues = getFakeGPUMetrics(metricValues, gpuIDs)
 
 	// Expected 9 metric values, because we injected 9 reasons
 	require.Len(t, metricValues, 9)
@@ -375,3 +375,110 @@ func TestClockEventsCollector_Gather_AllTheThings(t *testing.T) {
 		require.Equal(t, expectedVal, val.Value)
 	}
 }
+
+// TestClockEventsCollector_Gather_AllTheThings_WhenNoLabels checks that the
+// clock events collector still reports metrics when the counter list
+// contains no "label" counters.
+func TestClockEventsCollector_Gather_AllTheThings_WhenNoLabels(t *testing.T) {
+	teardownTest := setupTest(t)
+	defer teardownTest(t)
+	runOnlyWithLiveGPUs(t)
+
+	hostname := "local-test"
+	config := &Config{
+		GPUDevices: DeviceOptions{
+			Flex:       true,
+			MajorRange: []int{-1},
+			MinorRange: []int{-1},
+		},
+		ClockEventsCountWindowSize: int(time.Duration(5) * time.Minute),
+	}
+
+	records := [][]string{
+		{"DCGM_EXP_CLOCK_EVENTS_COUNT", "gauge", ""},
+	}
+
+	cc, err := extractCounters(records, config)
+	require.NoError(t, err)
+	require.Len(t, cc.ExporterCounters, 1)
+	require.Len(t, cc.DCGMCounters, 0)
+
+	// Create fake GPU
+	numGPUs, err := dcgm.GetAllDeviceCount()
+	require.NoError(t, err)
+
+	if numGPUs+1 > dcgm.MAX_NUM_DEVICES {
+		t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES)
+	}
+
+	entityList := []dcgm.MigHierarchyInfo{
+		{Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}},
+	}
+
+	gpuIDs, err := dcgm.CreateFakeEntities(entityList)
+	require.NoError(t, err)
+	require.NotEmpty(t, gpuIDs)
+
+	gpuID := gpuIDs[0]
+	err = dcgm.InjectFieldValue(gpuID,
+		dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS,
+		dcgm.DCGM_FT_INT64,
+		0,
+		time.Now().Add(-time.Duration(1)*time.Second).UnixMicro(),
+		int64(DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE|
+			DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING|
+			DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP|
+			DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN|
+			DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST|
+			DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL|
+			DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL|
+			DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE|
+			DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS),
+	)
+
+	require.NoError(t, err)
+
+	allCounters := []Counter{
+		{
+			FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS,
+		},
+	}
+
+	fieldEntityGroupTypeSystemInfo := NewEntityGroupTypeSystemInfo(allCounters, config)
+
+	err = fieldEntityGroupTypeSystemInfo.Load(dcgm.FE_GPU)
+	require.NoError(t, err)
+
+	item, _ := fieldEntityGroupTypeSystemInfo.Get(dcgm.FE_GPU)
+
+	collector, err := NewClockEventsCollector(cc.ExporterCounters, hostname, config, item)
+	require.NoError(t, err)
+
+	defer func() {
+		collector.Cleanup()
+	}()
+
+	metrics, err := collector.GetMetrics()
+	require.NoError(t, err)
+	require.NotEmpty(t, metrics)
+	// We expect 1 metric: DCGM_EXP_CLOCK_EVENTS_COUNT
+	require.Len(t, metrics, 1)
+	// We get metric value with 0 index
+	metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]
+	// Exclude the real GPU from the test
+	metricValues = getFakeGPUMetrics(metricValues, gpuIDs)
+	// Expected 9 metric values, because we injected 9 reasons
+	require.Len(t, metricValues, 9)
+}
+
+// getFakeGPUMetrics removes, in place, every metric whose GPU field parses as
+// a numeric ID that is not listed in gpuIDs, and returns the shortened slice.
+// Metrics whose GPU field is not a valid unsigned integer are kept.
+func getFakeGPUMetrics(metricValues []Metric, gpuIDs []uint) []Metric {
+	return slices.DeleteFunc(metricValues, func(m Metric) bool {
+		gpuID, err := strconv.ParseUint(m.GPU, 10, 64)
+		// DeleteFunc visits every element exactly once, so consecutive
+		// metrics that belong to a real GPU are all removed.
+		return err == nil && !slices.Contains(gpuIDs, uint(gpuID))
+	})
+}
diff --git a/pkg/dcgmexporter/expcollector.go b/pkg/dcgmexporter/expcollector.go
index c075eb50..93d4d179 100644
--- a/pkg/dcgmexporter/expcollector.go
+++ b/pkg/dcgmexporter/expcollector.go
@@ -20,7 +20,6 @@ import (
 	"fmt"
 	"io"
 	"maps"
-	"slices"
 	"sync"
 	"sync/atomic"
 	"text/template"
@@ -132,29 +131,12 @@ func (c *expCollector) getMetrics() (MetricsByCounter, error) {
 		uuid = "uuid"
 	}
 	for _, mi := range monitoringInfo {
-		latestValues, err := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.labelDeviceFields)
-		if err != nil {
-			return nil, err
-		}
-		// Extract Labels
-		for _, val := range latestValues {
-			v := ToString(val)
-			// Filter out counters with no value and ignored fields for this entity
-			if v == SkipDCGMValue {
-				continue
-			}
-
-			counter, err := FindCounterField(c.labelsCounters, val.FieldId)
+		if len(c.labelsCounters) > 0 {
+			err := c.getLabelsFromCounters(mi, labels)
 			if err != nil {
-				continue
-			}
-
-			if counter.PromType == "label" {
-				labels[counter.FieldName] = v
-				continue
+				return nil, err
 			}
 		}
-
 		entityValues, exists := mapEntityIDToValues[mi.DeviceInfo.GPU]
 		if exists {
 			for entityValue, val := range entityValues {
@@ -200,6 +182,36 @@ func (c *expCollector) getMetrics() (MetricsByCounter, error) {
 	return metrics, nil
 }
 
+// getLabelsFromCounters reads the latest values of c.labelDeviceFields for the
+// entity described by mi and stores every value that belongs to a counter of
+// Prometheus type "label" in labels, keyed by the counter's field name.
+// Values equal to SkipDCGMValue and fields without a matching counter are ignored.
+func (c *expCollector) getLabelsFromCounters(mi MonitoringInfo, labels map[string]string) error {
+	latestValues, err := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.labelDeviceFields)
+	if err != nil {
+		return err
+	}
+	// Extract Labels
+	for _, val := range latestValues {
+		v := ToString(val)
+		// Filter out counters with no value and ignored fields for this entity
+		if v == SkipDCGMValue {
+			continue
+		}
+
+		counter, err := FindCounterField(c.labelsCounters, val.FieldId)
+		if err != nil {
+			continue
+		}
+
+		if counter.PromType == "label" {
+			labels[counter.FieldName] = v
+			continue
+		}
+	}
+	return nil
+}
+
 func (c *expCollector) Cleanup() {
 	for _, cleanup := range c.cleanups {
 		cleanup()
@@ -218,7 +230,6 @@ func newExpCollector(
 	for i := 0; i < len(counters); i++ {
 		if counters[i].PromType == "label" {
 			labelsCounters = append(labelsCounters, counters[i])
-			counters = slices.Delete(counters, i, i+1)
 		}
 	}
 