mirror of
https://github.com/opencontainers/runc.git
synced 2025-09-26 19:41:35 +08:00
Merge pull request #4865 from cyphar/1.3-reset-cpu-affinity
[1.3] libct: reset CPU affinity by default
This commit is contained in:
@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased 1.3.z]
|
||||
|
||||
### Fixed
|
||||
* Container processes will no longer inherit the CPU affinity of runc by
|
||||
default. Instead, the default CPU affinity of container processes will be
|
||||
the largest set of CPUs permitted by the container's cpuset cgroup and any
|
||||
other system restrictions (such as isolated CPUs). (#4041, #4815, #4858)
|
||||
|
||||
## [1.3.0] - 2025-04-30
|
||||
|
||||
> Mr. President, we must not allow a mine shaft gap!
|
||||
|
@@ -163,6 +163,46 @@ type setnsProcess struct {
|
||||
initProcessPid int
|
||||
}
|
||||
|
||||
// tryResetCPUAffinity tries to reset the CPU affinity of the process
|
||||
// identified by pid to include all possible CPUs (notwithstanding cgroup
|
||||
// cpuset restrictions and isolated CPUs).
|
||||
func tryResetCPUAffinity(pid int) {
|
||||
// When resetting the CPU affinity, we want to match the configured cgroup
|
||||
// cpuset (or the default set of all CPUs, if no cpuset is configured)
|
||||
// rather than some more restrictive affinity we were spawned in (such as
|
||||
// one that may have been inherited from systemd). The cpuset cgroup used
|
||||
// to reconfigure the cpumask automatically for joining processes, but
|
||||
// kcommit da019032819a ("sched: Enforce user requested affinity") changed
|
||||
// this behaviour in Linux 6.2.
|
||||
//
|
||||
// Parsing cpuset.cpus.effective is quite inefficient (and looking at
|
||||
// things like /proc/stat would be wrong for most nested containers), but
|
||||
// luckily sched_setaffinity(2) will implicitly:
|
||||
//
|
||||
// * Clamp the cpumask so that it matches the current number of CPUs on
|
||||
// the system.
|
||||
// * Mask out any CPUs that are not a member of the target task's
|
||||
// configured cgroup cpuset.
|
||||
//
|
||||
// So we can just pass a very large array of set cpumask bits and the
|
||||
// kernel will silently convert that to the correct value very cheaply.
|
||||
|
||||
// Ideally, we would just set the array to 0xFF...FF. Unfortunately, the
|
||||
// size depends on the architecture. It is also a private newtype, so we
|
||||
// can't use (^0) or generics since those require us to be able to name the
|
||||
// type. However, we can just underflow the zero value instead.
|
||||
// TODO: Once <https://golang.org/cl/698015> is merged, switch to that.
|
||||
cpuset := unix.CPUSet{}
|
||||
for i := range cpuset {
|
||||
cpuset[i]-- // underflow to 0xFF..FF
|
||||
}
|
||||
if err := unix.SchedSetaffinity(pid, &cpuset); err != nil {
|
||||
logrus.WithError(
|
||||
os.NewSyscallError("sched_setaffinity", err),
|
||||
).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
|
||||
}
|
||||
}
|
||||
|
||||
// Starts setns process with specified initial CPU affinity.
|
||||
func (p *setnsProcess) startWithCPUAffinity() error {
|
||||
aff := p.config.CPUAffinity
|
||||
@@ -193,7 +233,13 @@ func (p *setnsProcess) startWithCPUAffinity() error {
|
||||
|
||||
func (p *setnsProcess) setFinalCPUAffinity() error {
|
||||
aff := p.config.CPUAffinity
|
||||
if aff == nil || aff.Final == nil {
|
||||
// If there was no affinity configured at all, we want to reset
|
||||
// the affinity to make sure we don't inherit an unexpected one.
|
||||
if aff == nil || aff.Final == nil && aff.Initial == nil {
|
||||
tryResetCPUAffinity(p.pid())
|
||||
return nil
|
||||
}
|
||||
if aff.Final == nil {
|
||||
return nil
|
||||
}
|
||||
if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
|
||||
@@ -619,6 +665,9 @@ func (p *initProcess) start() (retErr error) {
|
||||
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
|
||||
}
|
||||
}
|
||||
// Reset the CPU affinity after cgroups are configured to make sure it
|
||||
// matches any configured cpuset.
|
||||
tryResetCPUAffinity(p.pid())
|
||||
if p.intelRdtManager != nil {
|
||||
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
|
||||
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
|
||||
|
@@ -4,9 +4,14 @@
|
||||
|
||||
load helpers
|
||||
|
||||
# CPU list this test process is allowed to run on, captured once when the
# file is loaded (second field of the Cpus_allowed_list line).
INITIAL_CPU_MASK="$(awk '/Cpus_allowed_list:/ { print $2 }' /proc/self/status)"
|
||||
|
||||
function setup() {
	# Declare the prerequisites (smp, cgroups_cpuset) and prepare the busybox
	# bundle before each test.
	requires smp cgroups_cpuset
	setup_busybox

	# Record the CPU mask captured at load time on stderr, followed by a
	# separator line, to help when debugging.
	printf '%s\n' "Initial CPU mask: $INITIAL_CPU_MASK" "---" >&2
}
|
||||
|
||||
function teardown() {
|
||||
@@ -99,3 +104,107 @@ function cpus_to_mask() {
|
||||
[[ "$output" == *"nsexec"*": affinity: $mask"* ]]
|
||||
[[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab.
|
||||
}
|
||||
|
||||
@test "runc run [CPU affinity should reset]" {
|
||||
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
|
||||
# bash function (which is what runc and __runc are).
|
||||
setup_runc_cmdline
|
||||
|
||||
first="$(first_cpu)"
|
||||
|
||||
# Running without cpuset should result in an affinity for all CPUs.
|
||||
update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ]'
|
||||
update_config 'del(.linux.resources.cpu)'
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
|
||||
[[ "$output" == $'Cpus_allowed_list:\t'"$INITIAL_CPU_MASK" ]]
|
||||
}
|
||||
|
||||
@test "runc run [CPU affinity should reset to cgroup cpuset]" {
|
||||
[ $EUID -ne 0 ] && requires rootless_cgroup
|
||||
set_cgroups_path
|
||||
|
||||
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
|
||||
# bash function (which is what runc and __runc are).
|
||||
setup_runc_cmdline
|
||||
|
||||
first="$(first_cpu)"
|
||||
second="$((first + 1))" # Hacky; might not work in all environments.
|
||||
|
||||
# Running with a cpuset should result in an affinity that matches.
|
||||
update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ]'
|
||||
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$first-$second"'"}'
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
|
||||
# XXX: For some reason, systemd-cgroup leads to us using the all-set
|
||||
# cpumask rather than the cpuset we configured?
|
||||
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$first-$second" ]]
|
||||
|
||||
# Ditto for a cpuset that has no overlap with the original cpumask.
|
||||
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$second"'"}'
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
|
||||
# XXX: For some reason, systemd-cgroup leads to us using the all-set
|
||||
# cpumask rather than the cpuset we configured?
|
||||
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$second" ]]
|
||||
}
|
||||
|
||||
@test "runc exec [default CPU affinity should reset]" {
|
||||
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
|
||||
# bash function (which is what runc and __runc are).
|
||||
setup_runc_cmdline
|
||||
|
||||
first="$(first_cpu)"
|
||||
|
||||
# Running without cpuset should result in an affinity for all CPUs.
|
||||
update_config '.process.args = [ "/bin/sleep", "infinity" ]'
|
||||
update_config 'del(.linux.resources.cpu)'
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr3
|
||||
[ "$status" -eq 0 ]
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr3 grep -F Cpus_allowed_list: /proc/self/status
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
|
||||
[[ "$output" == $'Cpus_allowed_list:\t'"$INITIAL_CPU_MASK" ]]
|
||||
}
|
||||
|
||||
@test "runc exec [default CPU affinity should reset to cgroup cpuset]" {
|
||||
[ $EUID -ne 0 ] && requires rootless_cgroup
|
||||
set_cgroups_path
|
||||
|
||||
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
|
||||
# bash function (which is what runc and __runc are).
|
||||
setup_runc_cmdline
|
||||
|
||||
first="$(first_cpu)"
|
||||
second="$((first + 1))" # Hacky; might not work in all environments.
|
||||
|
||||
# Running with a cpuset should result in an affinity that matches.
|
||||
update_config '.process.args = [ "/bin/sleep", "infinity" ]'
|
||||
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$first-$second"'"}'
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr
|
||||
[ "$status" -eq 0 ]
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr grep -F Cpus_allowed_list: /proc/self/status
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
|
||||
# XXX: For some reason, systemd-cgroup leads to us using the all-set
|
||||
# cpumask rather than the cpuset we configured?
|
||||
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$first-$second" ]]
|
||||
|
||||
# Stop the container so we can reconfigure it.
|
||||
runc delete -f ctr
|
||||
[ "$status" -eq 0 ]
|
||||
|
||||
# Ditto for a cpuset that has no overlap with the original cpumask.
|
||||
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$second"'"}'
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr
|
||||
[ "$status" -eq 0 ]
|
||||
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr grep -F Cpus_allowed_list: /proc/self/status
|
||||
[ "$status" -eq 0 ]
|
||||
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
|
||||
# XXX: For some reason, systemd-cgroup leads to us using the all-set
|
||||
# cpumask rather than the cpuset we configured?
|
||||
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$second" ]]
|
||||
}
|
||||
|
@@ -36,22 +36,38 @@ ARCH=$(uname -m)
|
||||
# Seccomp agent socket.
|
||||
SECCCOMP_AGENT_SOCKET="$BATS_TMPDIR/seccomp-agent.sock"
|
||||
|
||||
# Wrapper around "run" that logs output to make tests easier to debug.
#
# Arguments:
#   $1   - command to execute (passed to "run")
#   $2.. - arguments for that command
# Environment:
#   CMDNAME - optional name to show in the log header instead of the
#             basename of $1
# The values of $status and $output left behind by "run" are printed to
# stderr and remain available to the caller.
function sane_run() {
	local cmd="$1"
	local cmdname="${CMDNAME:-$(basename "$cmd")}"
	shift

	run "$cmd" "$@"

	# Some debug information to make life easier. bats will only print it if the
	# test failed, in which case the output is useful.
	# Print exactly one header, labelled with the command that was actually
	# run rather than unconditionally with the runc binary name.
	# shellcheck disable=SC2154
	echo "$cmdname $* (status=$status)" >&2
	# shellcheck disable=SC2154
	echo "$output" >&2
}
|
||||
|
||||
# Wrapper for runc: executes __runc through sane_run so that the command, its
# exit status and its output get logged, with the log header labelled using
# the basename of $RUNC.
function runc() {
	CMDNAME="$(basename "$RUNC")" sane_run __runc "$@"
}
|
||||
|
||||
# Populates the RUNC_CMDLINE array with the runc binary plus the global flags
# selected by the environment: --systemd-cgroup when RUNC_USE_SYSTEMD is set,
# and --root "$ROOT/state" when ROOT is non-empty.
function setup_runc_cmdline() {
	RUNC_CMDLINE=("$RUNC")
	if [[ -v RUNC_USE_SYSTEMD ]]; then
		RUNC_CMDLINE+=("--systemd-cgroup")
	fi
	if [[ -n "${ROOT:-}" ]]; then
		RUNC_CMDLINE+=("--root" "$ROOT/state")
	fi
	export RUNC_CMDLINE
}
|
||||
|
||||
# Raw wrapper for runc: rebuilds RUNC_CMDLINE via setup_runc_cmdline and then
# executes it exactly once with the given arguments appended. Unlike runc(),
# this does not go through "run", so the exit status is returned directly.
function __runc() {
	setup_runc_cmdline
	"${RUNC_CMDLINE[@]}" "$@"
}
|
||||
|
||||
# Wrapper for runc spec.
|
||||
|
Reference in New Issue
Block a user