diff --git a/cluster-autoscaler/cloudprovider/gce/templates.go b/cluster-autoscaler/cloudprovider/gce/templates.go index 8a08d5dc02d9..b885c419d794 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates.go +++ b/cluster-autoscaler/cloudprovider/gce/templates.go @@ -75,10 +75,19 @@ func (t *templateBuilder) getCpuAndMemoryForMachineType(machineType string) (cpu return machine.GuestCpus, machine.MemoryMb * 1024 * 1024, nil } -func (t *templateBuilder) buildCapacity(machineType string) (apiv1.ResourceList, error) { +func (t *templateBuilder) getAcceleratorCount(accelerators []*gce.AcceleratorConfig) int64 { + count := int64(0) + for _, accelerator := range accelerators { + if strings.HasPrefix(accelerator.AcceleratorType, "nvidia-") { + count += accelerator.AcceleratorCount + } + } + return count +} + +func (t *templateBuilder) buildCapacity(machineType string, accelerators []*gce.AcceleratorConfig) (apiv1.ResourceList, error) { capacity := apiv1.ResourceList{} // TODO: get a real value. 
- // TODO: handle GPU capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI) cpu, mem, err := t.getCpuAndMemoryForMachineType(machineType) @@ -87,6 +96,11 @@ func (t *templateBuilder) buildCapacity(machineType string) (apiv1.ResourceList, } capacity[apiv1.ResourceCPU] = *resource.NewQuantity(cpu, resource.DecimalSI) capacity[apiv1.ResourceMemory] = *resource.NewQuantity(mem, resource.DecimalSI) + + if len(accelerators) > 0 { + capacity[apiv1.ResourceNvidiaGPU] = *resource.NewQuantity(t.getAcceleratorCount(accelerators), resource.DecimalSI) + } + return capacity, nil } @@ -147,7 +161,8 @@ func (t *templateBuilder) buildNodeFromTemplate(mig *Mig, template *gce.Instance SelfLink: fmt.Sprintf("/api/v1/nodes/%s", nodeName), Labels: map[string]string{}, } - capacity, err := t.buildCapacity(template.Properties.MachineType) + + capacity, err := t.buildCapacity(template.Properties.MachineType, template.Properties.GuestAccelerators) if err != nil { return nil, err } @@ -215,7 +230,8 @@ func (t *templateBuilder) buildNodeFromAutoprovisioningSpec(mig *Mig) (*apiv1.No SelfLink: fmt.Sprintf("/api/v1/nodes/%s", nodeName), Labels: map[string]string{}, } - capacity, err := t.buildCapacity(mig.spec.machineType) + // TODO: Handle GPU + capacity, err := t.buildCapacity(mig.spec.machineType, nil) if err != nil { return nil, err } diff --git a/cluster-autoscaler/cloudprovider/gce/templates_test.go b/cluster-autoscaler/cloudprovider/gce/templates_test.go index e3cf070b9d14..be5fe0bae8d8 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates_test.go +++ b/cluster-autoscaler/cloudprovider/gce/templates_test.go @@ -34,11 +34,13 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { kubeEnv string name string machineType string + accelerators []*gce.AcceleratorConfig mig *Mig capacityCpu string capacityMemory string allocatableCpu string allocatableMemory string + gpuCount int64 expectedErr bool } testCases := []testCase{{ @@ -49,6 
+51,10 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { "NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n", name: "nodeName", machineType: "custom-8-2", + accelerators: []*gce.AcceleratorConfig{ + {AcceleratorType: "nvidia-tesla-k80", AcceleratorCount: 3}, + {AcceleratorType: "nvidia-tesla-p100", AcceleratorCount: 8}, + }, mig: &Mig{GceRef: GceRef{ Name: "some-name", Project: "some-proj", @@ -57,6 +63,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { capacityMemory: fmt.Sprintf("%v", 2*1024*1024), allocatableCpu: "7000m", allocatableMemory: fmt.Sprintf("%v", 1024*1024), + gpuCount: 11, expectedErr: false, }, { kubeEnv: "ENABLE_NODE_PROBLEM_DETECTOR: 'daemonset'\n" + @@ -90,6 +97,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { template := &gce.InstanceTemplate{ Name: tc.name, Properties: &gce.InstanceProperties{ + GuestAccelerators: tc.accelerators, Metadata: &gce.Metadata{ Items: []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}}, }, @@ -102,10 +110,10 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { } else { assert.NoError(t, err) podsQuantity, _ := resource.ParseQuantity("110") - capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory) + capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory, tc.gpuCount) capacity[apiv1.ResourcePods] = podsQuantity assert.NoError(t, err) - allocatable, err := makeResourceList(tc.allocatableCpu, tc.allocatableMemory) + allocatable, err := makeResourceList(tc.allocatableCpu, tc.allocatableMemory, tc.gpuCount) allocatable[apiv1.ResourcePods] = podsQuantity assert.NoError(t, err) assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity) @@ -182,6 +190,7 @@ func TestBuildAllocatableFromKubeEnv(t *testing.T) { capacityMemory string expectedCpu string expectedMemory string + gpuCount int64 expectedErr bool } testCases := []testCase{{ @@ -194,6 +203,7 @@ func TestBuildAllocatableFromKubeEnv(t *testing.T) 
capacityMemory: "700000Mi", expectedCpu: "3000m", expectedMemory: "400000Mi", + gpuCount: 10, expectedErr: false, }, { kubeEnv: "ENABLE_NODE_PROBLEM_DETECTOR: 'daemonset'\n" + @@ -205,7 +215,7 @@ func TestBuildAllocatableFromKubeEnv(t *testing.T) { expectedErr: true, }} for _, tc := range testCases { - capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory) + capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory, tc.gpuCount) assert.NoError(t, err) tb := templateBuilder{} allocatable, err := tb.buildAllocatableFromKubeEnv(capacity, tc.kubeEnv) @@ -213,25 +223,59 @@ func TestBuildAllocatableFromKubeEnv(t *testing.T) { assert.Error(t, err) } else { assert.NoError(t, err) - expectedResources, err := makeResourceList(tc.expectedCpu, tc.expectedMemory) + expectedResources, err := makeResourceList(tc.expectedCpu, tc.expectedMemory, tc.gpuCount) assert.NoError(t, err) assert.Equal(t, expectedResources, allocatable) } } } +func TestGetAcceleratorCount(t *testing.T) { + testCases := []struct { + accelerators []*gce.AcceleratorConfig + count int64 + }{{ + accelerators: []*gce.AcceleratorConfig{}, + count: 0, + }, { + accelerators: []*gce.AcceleratorConfig{ + {AcceleratorType: "nvidia-tesla-k80", AcceleratorCount: 3}, + }, + count: 3, + }, { + accelerators: []*gce.AcceleratorConfig{ + {AcceleratorType: "nvidia-tesla-k80", AcceleratorCount: 3}, + {AcceleratorType: "nvidia-tesla-p100", AcceleratorCount: 8}, + }, + count: 11, + }, { + accelerators: []*gce.AcceleratorConfig{ + {AcceleratorType: "other-type", AcceleratorCount: 3}, + {AcceleratorType: "nvidia-tesla-p100", AcceleratorCount: 8}, + }, + count: 8, + }} + + for _, tc := range testCases { + tb := templateBuilder{} + assert.Equal(t, tc.count, tb.getAcceleratorCount(tc.accelerators)) + } +} + func TestBuildAllocatableFromCapacity(t *testing.T) { type testCase struct { capacityCpu string capacityMemory string allocatableCpu string allocatableMemory string + gpuCount int64 } testCases := 
[]testCase{{ capacityCpu: "16000m", capacityMemory: fmt.Sprintf("%v", 1*1024*1024*1024), allocatableCpu: "15890m", allocatableMemory: fmt.Sprintf("%v", 0.75*1024*1024*1024), + gpuCount: 1, }, { capacityCpu: "500m", capacityMemory: fmt.Sprintf("%v", 200*1000*1024*1024), @@ -240,9 +284,9 @@ func TestBuildAllocatableFromCapacity(t *testing.T) { }} for _, tc := range testCases { tb := templateBuilder{} - capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory) + capacity, err := makeResourceList(tc.capacityCpu, tc.capacityMemory, tc.gpuCount) assert.NoError(t, err) - expectedAllocatable, err := makeResourceList(tc.allocatableCpu, tc.allocatableMemory) + expectedAllocatable, err := makeResourceList(tc.allocatableCpu, tc.allocatableMemory, tc.gpuCount) assert.NoError(t, err) allocatable := tb.buildAllocatableFromCapacity(capacity) assertEqualResourceLists(t, "Allocatable", expectedAllocatable, allocatable) @@ -364,7 +408,7 @@ func TestParseKubeReserved(t *testing.T) { assert.Nil(t, resources) } else { assert.NoError(t, err) - expectedResources, err := makeResourceList(tc.expectedCpu, tc.expectedMemory) + expectedResources, err := makeResourceList(tc.expectedCpu, tc.expectedMemory, 0) assert.NoError(t, err) assertEqualResourceLists(t, "Resources", expectedResources, resources) } @@ -425,18 +469,22 @@ func makeTaintSet(taints []apiv1.Taint) map[apiv1.Taint]bool { return set } -func makeResourceList(cpu string, memory string) (apiv1.ResourceList, error) { +func makeResourceList(cpu string, memory string, gpu int64) (apiv1.ResourceList, error) { result := apiv1.ResourceList{} resultCpu, err := resource.ParseQuantity(cpu) if err != nil { return nil, err } + result[apiv1.ResourceCPU] = resultCpu resultMemory, err := resource.ParseQuantity(memory) if err != nil { return nil, err } - result[apiv1.ResourceCPU] = resultCpu result[apiv1.ResourceMemory] = resultMemory + if gpu > 0 { + resultGpu := *resource.NewQuantity(gpu, resource.DecimalSI)
 + result[apiv1.ResourceNvidiaGPU] = resultGpu + } return result, nil }