Merge pull request #4865 from cyphar/1.3-reset-cpu-affinity

[1.3] libct: reset CPU affinity by default
This commit is contained in:
lfbzhm
2025-08-28 12:28:02 +08:00
committed by GitHub
4 changed files with 187 additions and 7 deletions

View File

@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased 1.3.z]
### Fixed
* Container processes will no longer inherit the CPU affinity of runc by
default. Instead, the default CPU affinity of container processes will be
the largest set of CPUs permitted by the container's cpuset cgroup and any
other system restrictions (such as isolated CPUs). (#4041, #4815, #4858)
## [1.3.0] - 2025-04-30
> Mr. President, we must not allow a mine shaft gap!

View File

@@ -163,6 +163,46 @@ type setnsProcess struct {
initProcessPid int
}
// tryResetCPUAffinity makes a best-effort attempt to widen the CPU affinity
// of the process identified by pid to every CPU it may run on, i.e. all CPUs
// minus whatever the cgroup cpuset and isolated-CPU restrictions exclude.
// Failure is not fatal: it is logged as a warning and nothing is returned.
func tryResetCPUAffinity(pid int) {
	// The goal is for the affinity to follow the configured cgroup cpuset
	// (or, with no cpuset configured, the default set of all CPUs) and not
	// a narrower mask that runc happened to be started with -- for example
	// one inherited from systemd. Joining a cpuset cgroup used to rewrite
	// the cpumask on its own, but that stopped with Linux 6.2 in kcommit
	// da019032819a ("sched: Enforce user requested affinity").
	//
	// Computing the right mask by hand would mean parsing
	// cpuset.cpus.effective, which is quite inefficient (and /proc/stat and
	// friends would give wrong answers for most nested containers).
	// Fortunately sched_setaffinity(2) implicitly does two things for us:
	//
	//  * it clamps the cpumask to the current number of CPUs on the system;
	//  * it masks out CPUs that are not in the target task's configured
	//    cgroup cpuset.
	//
	// Requesting every bit therefore lets the kernel cheaply and silently
	// derive the correct value.
	//
	// The natural spelling would be filling the array with 0xFF...FF, but
	// the element size is architecture dependent and the element type is a
	// private newtype, so neither (^0) nor generics are usable as both
	// require naming the type. Decrementing the zero value sidesteps that.
	// TODO: Once <https://golang.org/cl/698015> is merged, switch to that.
	var allCPUs unix.CPUSet
	for idx := 0; idx < len(allCPUs); idx++ {
		allCPUs[idx]-- // 0 - 1 wraps around to 0xFF..FF
	}

	err := unix.SchedSetaffinity(pid, &allCPUs)
	if err == nil {
		return
	}
	logrus.
		WithError(os.NewSyscallError("sched_setaffinity", err)).
		Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
}
// Starts setns process with specified initial CPU affinity.
func (p *setnsProcess) startWithCPUAffinity() error {
aff := p.config.CPUAffinity
@@ -193,7 +233,13 @@ func (p *setnsProcess) startWithCPUAffinity() error {
func (p *setnsProcess) setFinalCPUAffinity() error {
aff := p.config.CPUAffinity
if aff == nil || aff.Final == nil {
// If there was no affinity configured at all, we want to reset
// the affinity to make sure we don't inherit an unexpected one.
if aff == nil || aff.Final == nil && aff.Initial == nil {
tryResetCPUAffinity(p.pid())
return nil
}
if aff.Final == nil {
return nil
}
if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
@@ -619,6 +665,9 @@ func (p *initProcess) start() (retErr error) {
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
}
}
// Reset the CPU affinity after cgroups are configured to make sure it
// matches any configured cpuset.
tryResetCPUAffinity(p.pid())
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)

View File

@@ -4,9 +4,14 @@
load helpers
# CPU list taken from the Cpus_allowed_list field of /proc/self/status when
# this file is loaded, i.e. outside of any test body. The "should reset" tests
# without a cpuset compare the container's Cpus_allowed_list against this.
INITIAL_CPU_MASK="$(grep -F Cpus_allowed_list: /proc/self/status | awk '{ print $2 }')"
# Per-test setup hook. Declares the smp and cgroups_cpuset requirements via
# the requires helper, calls setup_busybox (presumably prepares the busybox
# bundle -- helper not visible here), and then writes the CPU mask captured
# in INITIAL_CPU_MASK to stderr as debugging context, followed by a "---"
# separator line.
function setup() {
requires smp cgroups_cpuset
setup_busybox
echo "Initial CPU mask: $INITIAL_CPU_MASK" >&2
echo "---" >&2
}
function teardown() {
@@ -99,3 +104,107 @@ function cpus_to_mask() {
[[ "$output" == *"nsexec"*": affinity: $mask"* ]]
[[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab.
}
# Checks that "runc run" does not hand the caller's CPU affinity down to the
# container when no cpuset is configured: runc is started under taskset pinned
# to a single CPU, and the Cpus_allowed_list printed from inside the container
# must equal INITIAL_CPU_MASK instead of that single CPU.
@test "runc run [CPU affinity should reset]" {
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
# bash function (which is what runc and __runc are).
setup_runc_cmdline
# CPU used to pin runc; first_cpu is a helper defined outside this view
# (presumably yields the lowest usable CPU number -- TODO confirm).
first="$(first_cpu)"
# Running without cpuset should result in an affinity for all CPUs.
# The container process is grep printing its own Cpus_allowed_list line.
update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ]'
update_config 'del(.linux.resources.cpu)'
# Pin runc itself to $first so that an inherited affinity would be visible.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
[ "$status" -eq 0 ]
# The container must not be left on the pinned CPU only ...
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# ... and must instead report the mask recorded in INITIAL_CPU_MASK.
[[ "$output" == $'Cpus_allowed_list:\t'"$INITIAL_CPU_MASK" ]]
}
# Checks that "runc run" resets the container's CPU affinity to the configured
# cgroup cpuset even though runc itself is pinned to one CPU by taskset. Two
# cpusets are tried: one that contains the pinned CPU ("$first-$second") and
# one that does not overlap with it ("$second"). The exact-match assertions
# are not enforced when RUNC_USE_SYSTEMD is set (see the XXX notes below).
@test "runc run [CPU affinity should reset to cgroup cpuset]" {
# Non-root runs additionally declare the rootless_cgroup requirement.
[ $EUID -ne 0 ] && requires rootless_cgroup
set_cgroups_path
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
# bash function (which is what runc and __runc are).
setup_runc_cmdline
# CPU used to pin runc, and the CPU numbered directly after it.
first="$(first_cpu)"
second="$((first + 1))" # Hacky; might not work in all environments.
# Running with a cpuset should result in an affinity that matches.
# The container process is grep printing its own Cpus_allowed_list line.
update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ]'
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$first-$second"'"}'
# Pin runc itself to $first so that an inherited affinity would be visible.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
[ "$status" -eq 0 ]
# The container must not be left on the pinned CPU only.
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# XXX: For some reason, systemd-cgroup leads to us using the all-set
# cpumask rather than the cpuset we configured?
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$first-$second" ]]
# Ditto for a cpuset that has no overlap with the original cpumask.
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$second"'"}'
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
[ "$status" -eq 0 ]
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# XXX: For some reason, systemd-cgroup leads to us using the all-set
# cpumask rather than the cpuset we configured?
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$second" ]]
}
# Checks that "runc exec" does not hand the caller's CPU affinity down to the
# exec'd process when no cpuset is configured: a detached container running
# "sleep infinity" is started, then grep is exec'd into it with runc pinned to
# a single CPU, and the reported Cpus_allowed_list must equal
# INITIAL_CPU_MASK instead of that single CPU.
@test "runc exec [default CPU affinity should reset]" {
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
# bash function (which is what runc and __runc are).
setup_runc_cmdline
# CPU used to pin runc; first_cpu is a helper defined outside this view.
first="$(first_cpu)"
# Running without cpuset should result in an affinity for all CPUs.
# The container's main process just sleeps so that it stays available for exec.
update_config '.process.args = [ "/bin/sleep", "infinity" ]'
update_config 'del(.linux.resources.cpu)'
# Both the "run" and the "exec" invocation of runc are pinned to $first.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr3
[ "$status" -eq 0 ]
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr3 grep -F Cpus_allowed_list: /proc/self/status
[ "$status" -eq 0 ]
# The exec'd process must not be left on the pinned CPU only ...
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# ... and must instead report the mask recorded in INITIAL_CPU_MASK.
[[ "$output" == $'Cpus_allowed_list:\t'"$INITIAL_CPU_MASK" ]]
}
# Checks that "runc exec" resets the exec'd process's CPU affinity to the
# container's configured cgroup cpuset even though runc is pinned to one CPU
# by taskset. The container is created twice: once with a cpuset that contains
# the pinned CPU ("$first-$second") and once with a cpuset that does not
# overlap with it ("$second"). The exact-match assertions are not enforced
# when RUNC_USE_SYSTEMD is set (see the XXX notes below).
@test "runc exec [default CPU affinity should reset to cgroup cpuset]" {
# Non-root runs additionally declare the rootless_cgroup requirement.
[ $EUID -ne 0 ] && requires rootless_cgroup
set_cgroups_path
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
# bash function (which is what runc and __runc are).
setup_runc_cmdline
# CPU used to pin runc, and the CPU numbered directly after it.
first="$(first_cpu)"
second="$((first + 1))" # Hacky; might not work in all environments.
# Running with a cpuset should result in an affinity that matches.
# The container's main process just sleeps so that it stays available for exec.
update_config '.process.args = [ "/bin/sleep", "infinity" ]'
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$first-$second"'"}'
# Both the "run" and the "exec" invocation of runc are pinned to $first.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr
[ "$status" -eq 0 ]
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr grep -F Cpus_allowed_list: /proc/self/status
[ "$status" -eq 0 ]
# The exec'd process must not be left on the pinned CPU only.
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# XXX: For some reason, systemd-cgroup leads to us using the all-set
# cpumask rather than the cpuset we configured?
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$first-$second" ]]
# Stop the container so we can reconfigure it.
runc delete -f ctr
[ "$status" -eq 0 ]
# Ditto for a cpuset that has no overlap with the original cpumask.
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$second"'"}'
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr
[ "$status" -eq 0 ]
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr grep -F Cpus_allowed_list: /proc/self/status
[ "$status" -eq 0 ]
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# XXX: For some reason, systemd-cgroup leads to us using the all-set
# cpumask rather than the cpuset we configured?
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$second" ]]
}

View File

@@ -36,22 +36,38 @@ ARCH=$(uname -m)
# Seccomp agent socket.
SECCCOMP_AGENT_SOCKET="$BATS_TMPDIR/seccomp-agent.sock"
# Wrapper for runc.
function runc() {
run __runc "$@"
# Wrapper around "run" that logs output to make tests easier to debug.
function sane_run() {
local cmd="$1"
local cmdname="${CMDNAME:-$(basename "$cmd")}"
shift
run "$cmd" "$@"
# Some debug information to make life easier. bats will only print it if the
# test failed, in which case the output is useful.
# shellcheck disable=SC2154
echo "$(basename "$RUNC") $* (status=$status):" >&2
echo "$cmdname $* (status=$status)" >&2
# shellcheck disable=SC2154
echo "$output" >&2
}
# Wrapper for runc.
#
# Hands __runc and all arguments to sane_run. CMDNAME is set to the basename
# of $RUNC for that call so that the name sane_run uses when logging the
# command is the runc binary's name rather than "__runc".
function runc() {
CMDNAME="$(basename "$RUNC")" sane_run __runc "$@"
}
# Fills the global RUNC_CMDLINE array with the runc binary ($RUNC) followed by
# the global flags the environment asks for, so that callers needing a real
# argv rather than a shell function can run "${RUNC_CMDLINE[@]}".
function setup_runc_cmdline() {
	RUNC_CMDLINE=("$RUNC")

	# --systemd-cgroup is added whenever RUNC_USE_SYSTEMD is set, even to "".
	if [[ -v RUNC_USE_SYSTEMD ]]; then
		RUNC_CMDLINE+=("--systemd-cgroup")
	fi

	# A state directory is only passed along for a non-empty ROOT.
	if [[ -n "${ROOT:-}" ]]; then
		RUNC_CMDLINE+=("--root" "$ROOT/state")
	fi

	export RUNC_CMDLINE
}
# Raw wrapper for runc.
function __runc() {
"$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} \
${ROOT:+--root "$ROOT/state"} "$@"
setup_runc_cmdline
"${RUNC_CMDLINE[@]}" "$@"
}
# Wrapper for runc spec.