runc exec: implement CPU affinity

kolyshkin · kolyshkin · commit 10ca66bff51f · 2025-03-02T19:17:41.000-08:00
As per opencontainers/runtime-spec#1253 and opencontainers/runtime-spec#1261, CPU affinity can be set in two ways: 1. When creating/starting a container, in config.json's Process.ExecCPUAffinity, which is then applied to all execs. 2. When running an exec, in process.json's CPUAffinity, which is applied to a given exec and overrides the value from (1). Add some basic tests. Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a process to that of a container's cgroup, as soon as it is moved to that cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that. Because of the above, - it's impossible to really test initial CPU affinity without adding debug logging to libcontainer/nsenter; - for older kernels, there can be a brief moment when exec's affinity is different from either the initial or the final affinity being set; - exec's final CPU affinity, if not specified, can be different depending on the kernel, therefore we don't test it. Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
    methods no longer ignore `Process.IOPriority` and `Process.Scheduler`
    settings. (#4585)
 
+### Added
+ * CPU affinity support for `runc exec`. (#4327)
+
 ## [1.2.5] - 2025-02-13
 
 > Мороз и солнце; день чудесный!
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
@@ -3,9 +3,13 @@ package configs
 import (
 	"bytes"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"os/exec"
+	"strconv"
+	"strings"
 	"time"
+	"unsafe"
 
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
@@ -225,6 +229,9 @@ type Config struct {
 
 	// IOPriority is the container's I/O priority.
 	IOPriority *IOPriority `json:"io_priority,omitempty"`
+
+	// ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
+	ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
 }
 
 // Scheduler is based on the Linux sched_setattr(2) syscall.
@@ -288,6 +295,90 @@ func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
 
 type IOPriority = specs.LinuxIOPriority
 
+// CPUAffinity holds the parsed CPU affinity settings for a process as a pair
+// of CPU sets. A nil set means the corresponding value is not specified.
+//
+// NOTE(review): presumably Initial is applied when the process is started and
+// Final once it has been moved into the container's cgroup -- confirm against
+// the runtime-spec.
+type CPUAffinity struct {
+	// Initial is the initial CPU affinity, or nil if unset.
+	Initial *unix.CPUSet
+	// Final is the final CPU affinity, or nil if unset.
+	Final *unix.CPUSet
+}
+
+func toCPUSet(str string) (*unix.CPUSet, error) {
+	if str == "" {
+		return nil, nil
+	}
+	s := new(unix.CPUSet)
+
+	// Since (*CPUset).Set silently ignores too high CPU values,
+	// find out what the maximum is, and return an error.
+	maxCPU := uint64(unsafe.Sizeof(*s) * 8)
+	toInt := func(v string) (int, error) {
+		ret, err := strconv.ParseUint(v, 10, 32)
+		if err != nil {
+			return 0, err
+		}
+		if ret >= maxCPU {
+			return 0, fmt.Errorf("values larger than %d are not supported", maxCPU-1)
+		}
+		return int(ret), nil
+	}
+
+	for _, r := range strings.Split(str, ",") {
+		// Allow extra spaces around.
+		r = strings.TrimSpace(r)
+		// Allow empty elements (extra commas).
+		if r == "" {
+			continue
+		}
+		if r0, r1, found := strings.Cut(r, "-"); found {
+			start, err := toInt(r0)
+			if err != nil {
+				return nil, err
+			}
+			end, err := toInt(r1)
+			if err != nil {
+				return nil, err
+			}
+			if start > end {
+				return nil, errors.New("invalid range: " + r)
+			}
+			for i := start; i <= end; i++ {
+				s.Set(i)
+			}
+		} else {
+			val, err := toInt(r)
+			if err != nil {
+				return nil, err
+			}
+			s.Set(val)
+		}
+	}
+	if s.Count() == 0 {
+		return nil, fmt.Errorf("no CPUs found in %q", str)
+	}
+
+	return s, nil
+}
+
+// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity].
+func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
+	if sa == nil {
+		return nil, nil
+	}
+	initial, err := toCPUSet(sa.Initial)
+	if err != nil {
+		return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
+	}
+	final, err := toCPUSet(sa.Final)
+	if err != nil {
+		return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
+	}
+	if initial == nil && final == nil {
+		return nil, nil
+	}
+
+	return &CPUAffinity{
+		Initial: initial,
+		Final:   final,
+	}, nil
+}
+
 type (
 	HookName string
 	HookList []Hook
diff --git a/libcontainer/configs/tocpuset_test.go b/libcontainer/configs/tocpuset_test.go
@@ -0,0 +1,89 @@
+package configs
+
+import (
+	"testing"
+
+	"golang.org/x/sys/unix"
+)
+
+// TestToCPUSet runs toCPUSet over a table of valid and invalid CPU lists
+// and checks the resulting set (or the presence of an error).
+func TestToCPUSet(t *testing.T) {
+	// mkSet builds a CPU set with the given CPUs enabled.
+	mkSet := func(cpus ...int) *unix.CPUSet {
+		res := new(unix.CPUSet)
+		for _, n := range cpus {
+			res.Set(n)
+		}
+		return res
+	}
+
+	tests := []struct {
+		in    string
+		out   *unix.CPUSet
+		isErr bool
+	}{
+		{in: ""}, // Empty means unset.
+
+		// Valid cases.
+		{in: "0", out: &unix.CPUSet{1}},
+		{in: "1", out: &unix.CPUSet{2}},
+		{in: "0-1", out: &unix.CPUSet{3}},
+		{in: "0,1", out: &unix.CPUSet{3}},
+		{in: ",0,1,", out: &unix.CPUSet{3}},
+		{in: "0-3", out: &unix.CPUSet{0x0f}},
+		{in: "0,1,2-3", out: &unix.CPUSet{0x0f}},
+		{in: "4-7", out: &unix.CPUSet{0xf0}},
+		{in: "0-7", out: &unix.CPUSet{0xff}},
+		{in: "0-15", out: &unix.CPUSet{0xffff}},
+		{in: "16", out: &unix.CPUSet{0x10000}},
+		// Extra whitespace between ranges is OK.
+		{in: "1, 2, 1-2", out: &unix.CPUSet{6}},
+		{in: "    , 1   , 3  ,  5-7,    ", out: &unix.CPUSet{0xea}},
+		// Somewhat large values. The underlying type in unix.CPUSet
+		// can either be uint32 or uint64, so we have to use a helper.
+		{in: "0-3,32-33", out: mkSet(0, 1, 2, 3, 32, 33)},
+		{in: "127-129, 1", out: mkSet(1, 127, 128, 129)},
+		{in: "1023", out: mkSet(1023)},
+
+		// Error cases.
+		{in: "-", isErr: true},
+		{in: "1-", isErr: true},
+		{in: "-3", isErr: true},
+		{in: ",", isErr: true},
+		{in: " ", isErr: true},
+		// Bad range (start > end).
+		{in: "54-53", isErr: true},
+		// Extra spaces inside a range are not OK.
+		{in: "1 - 2", isErr: true},
+		{in: "1024", isErr: true}, // Too big for unix.CPUSet.
+	}
+
+	for _, tc := range tests {
+		tc := tc // Shadow the loop variable for the closure below.
+		t.Run(tc.in, func(t *testing.T) {
+			out, err := toCPUSet(tc.in)
+			t.Logf("toCPUSet(%q) = %v (error: %v)", tc.in, out, err)
+
+			// The cases are checked in order; the first one that
+			// matches decides the outcome.
+			switch {
+			case tc.isErr:
+				// An error is expected; the value is not checked.
+				if err == nil {
+					t.Error("want error, got nil")
+				}
+			case err != nil:
+				t.Fatalf("want no error, got %v", err)
+			case tc.out == nil:
+				// No set is expected.
+				if out != nil {
+					t.Fatalf("want nil, got %v", out)
+				}
+			case out == nil:
+				t.Fatalf("want %v, got nil", tc.out)
+			case *out != *tc.out:
+				t.Errorf("case %q: want %v, got %v", tc.in, tc.out, out)
+			}
+		})
+	}
+}
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
@@ -709,6 +709,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
 		Rlimits:          c.config.Rlimits,
 		IOPriority:       c.config.IOPriority,
 		Scheduler:        c.config.Scheduler,
+		CPUAffinity:      c.config.ExecCPUAffinity,
 		CreateConsole:    process.ConsoleSocket != nil,
 		ConsoleWidth:     process.ConsoleWidth,
 		ConsoleHeight:    process.ConsoleHeight,
@@ -737,6 +738,9 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
 	if process.Scheduler != nil {
 		cfg.Scheduler = process.Scheduler
 	}
+	if process.CPUAffinity != nil {
+		cfg.CPUAffinity = process.CPUAffinity
+	}
 
 	// Set misc properties.
 
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
@@ -83,6 +83,7 @@ type initConfig struct {
 	Rlimits         []configs.Rlimit      `json:"rlimits"`
 	IOPriority      *configs.IOPriority   `json:"io_priority,omitempty"`
 	Scheduler       *configs.Scheduler    `json:"scheduler,omitempty"`
+	CPUAffinity     *configs.CPUAffinity  `json:"cpu_affinity,omitempty"`
 
 	// Miscellaneous properties, filled in by [Container.newInitConfig]
 	// unless documented otherwise.
diff --git a/libcontainer/nsenter/log.c b/libcontainer/nsenter/log.c
@@ -31,6 +31,11 @@ void setup_logpipe(void)
 	loglevel = i;
 }
 
+/*
+ * log_enabled_for reports whether a message of the given level would be
+ * written by write_log, i.e. whether the log fd is set (non-negative) and
+ * the level does not exceed the configured loglevel.
+ */
+bool log_enabled_for(int level)
+{
+	if (logfd < 0)
+		return false;
+
+	return level <= loglevel;
+}
+
 /* Defined in nsexec.c */
 extern int current_stage;
 
@@ -40,8 +45,8 @@ void write_log(int level, const char *format, ...)
 	va_list args;
 	int ret;
 
-	if (logfd < 0 || level > loglevel)
-		goto out;
+	if (!log_enabled_for(level))
+		return;
 
 	va_start(args, format);
 	ret = vasprintf(&message, format, args);
diff --git a/libcontainer/nsenter/log.h b/libcontainer/nsenter/log.h
@@ -1,6 +1,7 @@
 #ifndef NSENTER_LOG_H
 #define NSENTER_LOG_H
 
+#include <stdbool.h>
 #include <stdio.h>
 
 /*
@@ -20,6 +21,8 @@
  */
 void setup_logpipe(void);
 
+bool log_enabled_for(int level);
+
 void write_log(int level, const char *format, ...) __attribute__((format(printf, 2, 3)));
 
 extern int logfd;
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
@@ -673,6 +673,28 @@ static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
 		bail("failed to update /proc/%d/timens_offsets", pid);
 }
 
+/*
+ * log_cpu_affinity logs, at DEBUG level, the caller's current CPU affinity
+ * as a hexadecimal bitmask in which bit N is set if CPU N is in the affinity
+ * set. Only the first sizeof(size_t) * 8 CPUs are reported.
+ *
+ * Nothing is done if DEBUG logging is not enabled. If the affinity can not
+ * be obtained, a WARNING is logged instead.
+ */
+static void log_cpu_affinity(void)
+{
+	cpu_set_t cpus = { };
+	size_t i, mask = 0;
+
+	/* Avoid the syscall when the result would not be logged anyway. */
+	if (!log_enabled_for(DEBUG))
+		return;
+
+	if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0) {
+		write_log(WARNING, "sched_getaffinity: %m");
+		return;
+	}
+
+	/* Do not print the complete mask, we only need a few first CPUs. */
+	for (i = 0; i < sizeof(mask) * 8; i++) {
+		/*
+		 * The shifted constant must be as wide as the mask: shifting
+		 * a plain int 1 by 31 or more bits is undefined behavior and
+		 * would corrupt the bits for the higher CPUs.
+		 */
+		if (CPU_ISSET(i, &cpus))
+			mask |= (size_t)1 << i;
+	}
+
+	write_log(DEBUG, "affinity: 0x%zx", mask);
+}
+
 void nsexec(void)
 {
 	int pipenum;
@@ -699,6 +721,15 @@ void nsexec(void)
 
 	write_log(DEBUG, "=> nsexec container setup");
 
+	/* Log initial CPU affinity, this is solely for the tests in
+	 * ../../tests/integration/cpu_affinity.bats.
+	 *
+	 * Logging this from Go code might be too late as some kernels
+	 * change the process' CPU affinity to that of container's cpuset
+	 * as soon as the process is moved into container's cgroup.
+	 */
+	log_cpu_affinity();
+
 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);
 
diff --git a/libcontainer/process.go b/libcontainer/process.go
@@ -121,6 +121,8 @@ type Process struct {
 	//
 	// If not empty, takes precedence over container's [configs.Config.IOPriority].
 	IOPriority *configs.IOPriority
+
+	CPUAffinity *configs.CPUAffinity
 }
 
 // Wait waits for the process to exit.
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
diff --git a/tests/integration/cpu_affinity.bats b/tests/integration/cpu_affinity.bats
diff --git a/utils_linux.go b/utils_linux.go

Original file line number	Diff line number	Diff line change
`@@ -121,6 +121,8 @@ type Process struct {`
`121`	`121`	`//`
`122`	`122`	`// If not empty, takes precedence over container's [configs.Config.IOPriority].`
`123`	`123`	`IOPriority *configs.IOPriority`
	`124`	`+`
	`125`	`+ CPUAffinity *configs.CPUAffinity`
`124`	`126`	`}`
`125`	`127`
`126`	`128`	`// Wait waits for the process to exit.`