Skip to content

Commit 43d1b00

Browse files
committed
runc exec: implement CPU affinity
As per opencontainers/runtime-spec#1253 and opencontainers/runtime-spec#1261, CPU affinity can be set in two ways: 1. When creating/starting a container, in config.json's Process.ExecCPUAffinity, which is then applied to all execs. 2. When running an exec, in process.json's CPUAffinity, which is applied to a given exec and overrides the value from (1). Add some basic tests. Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a process to that of a container's cgroup, as soon as it is moved to that cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that. Because of the above, - it's impossible to really test initial CPU affinity without adding debug logging to libcontainer/nsenter; - for older kernels, there can be a brief moment when exec's affinity is different from either the initial or the final affinity being set; - exec's final CPU affinity, if not specified, can be different depending on the kernel, therefore we don't test it. Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
1 parent 701516b commit 43d1b00

13 files changed

+381
-5
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2323
methods no longer ignore `Process.IOPriority` and `Process.Scheduler`
2424
settings. (#4585)
2525

26+
### Added
27+
* CPU affinity support for `runc exec`. (#4327)
28+
2629
## [1.2.5] - 2025-02-13
2730

2831
> Мороз и солнце; день чудесный!

libcontainer/configs/config.go

+89
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,13 @@ package configs
33
import (
44
"bytes"
55
"encoding/json"
6+
"errors"
67
"fmt"
78
"os/exec"
9+
"strconv"
10+
"strings"
811
"time"
12+
"unsafe"
913

1014
"github.com/sirupsen/logrus"
1115
"golang.org/x/sys/unix"
@@ -225,6 +229,9 @@ type Config struct {
225229

226230
// IOPriority is the container's I/O priority.
227231
IOPriority *IOPriority `json:"io_priority,omitempty"`
232+
233+
// ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
234+
ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
228235
}
229236

230237
// Scheduler is based on the Linux sched_setattr(2) syscall.
@@ -288,6 +295,88 @@ func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
288295

289296
type IOPriority = specs.LinuxIOPriority
290297

298+
type CPUAffinity struct {
299+
Initial, Final *unix.CPUSet
300+
}
301+
302+
func toCPUSet(str string) (*unix.CPUSet, error) {
303+
if str == "" {
304+
return nil, nil
305+
}
306+
s := new(unix.CPUSet)
307+
308+
maxCPU := uint64(unsafe.Sizeof(*s) * 8)
309+
toInt := func(v string) (int, error) {
310+
ret, err := strconv.ParseUint(v, 10, 32)
311+
if err != nil {
312+
return 0, err
313+
}
314+
if ret >= maxCPU {
315+
return 0, fmt.Errorf("values larger than %d are not supported", maxCPU-1)
316+
}
317+
return int(ret), nil
318+
}
319+
320+
for _, r := range strings.Split(str, ",") {
321+
// Allow extra spaces around.
322+
r = strings.TrimSpace(r)
323+
// Allow empty elements (extra commas).
324+
if r == "" {
325+
continue
326+
}
327+
if r0, r1, found := strings.Cut(r, "-"); found {
328+
start, err := toInt(r0)
329+
if err != nil {
330+
return nil, err
331+
}
332+
end, err := toInt(r1)
333+
if err != nil {
334+
return nil, err
335+
}
336+
if start > end {
337+
return nil, errors.New("invalid range: " + r)
338+
}
339+
for i := start; i <= end; i++ {
340+
s.Set(i)
341+
}
342+
} else {
343+
val, err := toInt(r)
344+
if err != nil {
345+
return nil, err
346+
}
347+
s.Set(int(val))
348+
}
349+
}
350+
if s.Count() == 0 {
351+
return nil, fmt.Errorf("no CPUs found in %q", str)
352+
}
353+
354+
return s, nil
355+
}
356+
357+
// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity].
358+
func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
359+
if sa == nil {
360+
return nil, nil
361+
}
362+
initial, err := toCPUSet(sa.Initial)
363+
if err != nil {
364+
return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
365+
}
366+
final, err := toCPUSet(sa.Final)
367+
if err != nil {
368+
return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
369+
}
370+
if initial == nil && final == nil {
371+
return nil, nil
372+
}
373+
374+
return &CPUAffinity{
375+
Initial: initial,
376+
Final: final,
377+
}, nil
378+
}
379+
291380
type (
292381
HookName string
293382
HookList []Hook

libcontainer/configs/tocpuset_test.go

+82
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
package configs
2+
3+
import (
4+
"testing"
5+
6+
"golang.org/x/sys/unix"
7+
)
8+
9+
func TestToCPUSet(t *testing.T) {
10+
testCases := []struct {
11+
in string
12+
out *unix.CPUSet
13+
isErr bool
14+
}{
15+
{in: ""}, // Empty means unset.
16+
17+
// Valid cases.
18+
{in: "0", out: &unix.CPUSet{1}},
19+
{in: "1", out: &unix.CPUSet{2}},
20+
{in: "0-1", out: &unix.CPUSet{3}},
21+
{in: "0,1", out: &unix.CPUSet{3}},
22+
{in: ",0,1,", out: &unix.CPUSet{3}},
23+
{in: "0-3", out: &unix.CPUSet{0x0f}},
24+
{in: "0,1,2-3", out: &unix.CPUSet{0x0f}},
25+
{in: "4-7", out: &unix.CPUSet{0xf0}},
26+
{in: "0-7", out: &unix.CPUSet{0xff}},
27+
{in: "0-15", out: &unix.CPUSet{0xffff}},
28+
{in: "16", out: &unix.CPUSet{0x10000}},
29+
{in: "0-3,32-33", out: &unix.CPUSet{0x30000000f}},
30+
{in: "0-63", out: &unix.CPUSet{0xffffffffffffffff}},
31+
{in: "64-127", out: &unix.CPUSet{0, 0xffffffffffffffff}},
32+
// Extra whitespace in between ranges are OK.
33+
{in: "1, 2, 1-2", out: &unix.CPUSet{6}},
34+
{in: " , 1 , 3 , 5-7, ", out: &unix.CPUSet{0xea}},
35+
// Somewhat large values.
36+
{in: "128-130,1", out: &unix.CPUSet{2, 0, 7}},
37+
{in: "1023", out: &unix.CPUSet{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x8000000000000000}},
38+
39+
// Error cases.
40+
{in: "-", isErr: true},
41+
{in: "1-", isErr: true},
42+
{in: "-3", isErr: true},
43+
{in: ",", isErr: true},
44+
{in: " ", isErr: true},
45+
// Bad range (start > end).
46+
{in: "54-53", isErr: true},
47+
// Extra spaces inside a range is not OK.
48+
{in: "1 - 2", isErr: true},
49+
{in: "1024", isErr: true}, // Too big for unix.CPUSet.
50+
}
51+
52+
for _, tc := range testCases {
53+
tc := tc
54+
t.Run(tc.in, func(t *testing.T) {
55+
out, err := toCPUSet(tc.in)
56+
t.Logf("toCPUSet(%q) = %v (error: %v)", tc.in, out, err)
57+
// Check the error.
58+
if tc.isErr {
59+
if err == nil {
60+
t.Error("want error, got nil")
61+
}
62+
return // No more checks.
63+
}
64+
if err != nil {
65+
t.Fatalf("want no error, got %v", err)
66+
}
67+
// Check the value.
68+
if tc.out == nil {
69+
if out != nil {
70+
t.Fatalf("want nil, got %v", out)
71+
}
72+
return // No more checks.
73+
}
74+
if out == nil {
75+
t.Fatalf("want %v, got nil", tc.out)
76+
}
77+
if *out != *tc.out {
78+
t.Errorf("case %q: want %v, got %v", tc.in, tc.out, out)
79+
}
80+
})
81+
}
82+
}

libcontainer/container_linux.go

+4
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
709709
Rlimits: c.config.Rlimits,
710710
IOPriority: c.config.IOPriority,
711711
Scheduler: c.config.Scheduler,
712+
CPUAffinity: c.config.ExecCPUAffinity,
712713
CreateConsole: process.ConsoleSocket != nil,
713714
ConsoleWidth: process.ConsoleWidth,
714715
ConsoleHeight: process.ConsoleHeight,
@@ -737,6 +738,9 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
737738
if process.Scheduler != nil {
738739
cfg.Scheduler = process.Scheduler
739740
}
741+
if process.CPUAffinity != nil {
742+
cfg.CPUAffinity = process.CPUAffinity
743+
}
740744

741745
// Set misc properties.
742746

libcontainer/init_linux.go

+1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ type initConfig struct {
8383
Rlimits []configs.Rlimit `json:"rlimits"`
8484
IOPriority *configs.IOPriority `json:"io_priority,omitempty"`
8585
Scheduler *configs.Scheduler `json:"scheduler,omitempty"`
86+
CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"`
8687

8788
// Miscellaneous properties, filled in by [Container.newInitConfig]
8889
// unless documented otherwise.

libcontainer/nsenter/log.c

+7-2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ void setup_logpipe(void)
3131
loglevel = i;
3232
}
3333

34+
bool log_enabled_for(int level)
35+
{
36+
return (logfd >= 0 && level <= loglevel);
37+
}
38+
3439
/* Defined in nsexec.c */
3540
extern int current_stage;
3641

@@ -40,8 +45,8 @@ void write_log(int level, const char *format, ...)
4045
va_list args;
4146
int ret;
4247

43-
if (logfd < 0 || level > loglevel)
44-
goto out;
48+
if (!log_enabled_for(level))
49+
return;
4550

4651
va_start(args, format);
4752
ret = vasprintf(&message, format, args);

libcontainer/nsenter/log.h

+3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef NSENTER_LOG_H
22
#define NSENTER_LOG_H
33

4+
#include <stdbool.h>
45
#include <stdio.h>
56

67
/*
@@ -20,6 +21,8 @@
2021
*/
2122
void setup_logpipe(void);
2223

24+
bool log_enabled_for(int level);
25+
2326
void write_log(int level, const char *format, ...) __attribute__((format(printf, 2, 3)));
2427

2528
extern int logfd;

libcontainer/nsenter/nsexec.c

+32
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,29 @@ static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
673673
bail("failed to update /proc/%d/timens_offsets", pid);
674674
}
675675

676+
677+
static void log_cpu_affinity()
678+
{
679+
cpu_set_t cpus = { };
680+
size_t i, mask = 0;
681+
682+
if (!log_enabled_for(DEBUG))
683+
return;
684+
685+
if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0) {
686+
write_log(WARNING, "sched_getaffinity: %m");
687+
return;
688+
}
689+
690+
/* Do not print the complete mask, we only need a few first CPUs. */
691+
for (i = 0; i < sizeof(mask) * 8; i++) {
692+
if (CPU_ISSET(i, &cpus))
693+
mask |= 1 << i;
694+
}
695+
696+
write_log(DEBUG, "affinity: 0x%zx", mask);
697+
}
698+
676699
void nsexec(void)
677700
{
678701
int pipenum;
@@ -699,6 +722,15 @@ void nsexec(void)
699722

700723
write_log(DEBUG, "=> nsexec container setup");
701724

725+
/* Log initial CPU affinity, this is solely for the tests in
726+
* ../../tests/integration/cpu_affinity.bats.
727+
*
728+
* Logging this from Go code might be too late as some kernels
729+
* change the process' CPU affinity to that of container's cpuset
730+
* as soon as the process is moved into container's cgroup.
731+
*/
732+
log_cpu_affinity();
733+
702734
/* Parse all of the netlink configuration. */
703735
nl_parse(pipenum, &config);
704736

libcontainer/process.go

+2
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ type Process struct {
121121
//
122122
// If not empty, takes precedence over container's [configs.Config.IOPriority].
123123
IOPriority *configs.IOPriority
124+
125+
CPUAffinity *configs.CPUAffinity
124126
}
125127

126128
// Wait waits for the process to exit.

libcontainer/process_linux.go

+46-3
Original file line numberDiff line numberDiff line change
@@ -163,13 +163,52 @@ type setnsProcess struct {
163163
initProcessPid int
164164
}
165165

166+
// Starts setns process with specified initial CPU affinity.
167+
func (p *setnsProcess) startWithCPUAffinity() error {
168+
aff := p.config.CPUAffinity
169+
if aff == nil || aff.Initial == nil {
170+
return p.cmd.Start()
171+
}
172+
errCh := make(chan error)
173+
defer close(errCh)
174+
175+
// Use a goroutine to dedicate an OS thread.
176+
go func() {
177+
runtime.LockOSThread()
178+
// Command inherits the CPU affinity.
179+
if err := unix.SchedSetaffinity(unix.Gettid(), aff.Initial); err != nil {
180+
errCh <- fmt.Errorf("error setting initial CPU affinity: %w", err)
181+
return
182+
}
183+
184+
errCh <- p.cmd.Start()
185+
// Deliberately omit runtime.UnlockOSThread here.
186+
// https://pkg.go.dev/runtime#LockOSThread says:
187+
// "If the calling goroutine exits without unlocking the
188+
// thread, the thread will be terminated".
189+
}()
190+
191+
return <-errCh
192+
}
193+
194+
func (p *setnsProcess) setFinalCPUAffinity() error {
195+
aff := p.config.CPUAffinity
196+
if aff == nil || aff.Final == nil {
197+
return nil
198+
}
199+
if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
200+
return fmt.Errorf("error setting final CPU affinity: %w", err)
201+
}
202+
return nil
203+
}
204+
166205
func (p *setnsProcess) start() (retErr error) {
167206
defer p.comm.closeParent()
168207

169-
// get the "before" value of oom kill count
208+
// Get the "before" value of oom kill count.
170209
oom, _ := p.manager.OOMKillCount()
171-
err := p.cmd.Start()
172-
// close the child-side of the pipes (controlled by child)
210+
err := p.startWithCPUAffinity()
211+
// Close the child-side of the pipes (controlled by child).
173212
p.comm.closeChild()
174213
if err != nil {
175214
return fmt.Errorf("error starting setns process: %w", err)
@@ -219,6 +258,10 @@ func (p *setnsProcess) start() (retErr error) {
219258
}
220259
}
221260
}
261+
// Set final CPU affinity right after the process is moved into container's cgroup.
262+
if err := p.setFinalCPUAffinity(); err != nil {
263+
return err
264+
}
222265
if p.intelRdtPath != "" {
223266
// if Intel RDT "resource control" filesystem path exists
224267
_, err := os.Stat(p.intelRdtPath)

0 commit comments

Comments
 (0)