Skip to content

Commit

Permalink
fix: audit and fix cgroup reservations
Browse files Browse the repository at this point in the history
Fixes: #7081

Review all reservations and limits set, test under stress load (using
both memory and CPU).

The goal: system components (Talos itself) and runtime (kubelet, CRI)
should survive under extreme resource starvation (workloads consuming
all CPU/memory).

Uses #9337 to visualize changes, but doesn't depend on it.

Signed-off-by: Andrey Smirnov <[email protected]>
  • Loading branch information
smira committed Sep 19, 2024
1 parent 806b6aa commit ed43698
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupInitReservedMemory),
Low: pointer.To[int64](constants.CgroupInitReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](constants.CgroupInitCPUWeight),
},
},
},
{
Expand All @@ -191,15 +194,36 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupSystemReservedMemory),
Low: pointer.To[int64](constants.CgroupSystemReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](constants.CgroupSystemCPUWeight),
},
},
},
{
name: constants.CgroupSystemRuntime,
resources: &cgroup2.Resources{},
name: constants.CgroupSystemRuntime,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory),
Low: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory * 2),
},
},
},
{
name: constants.CgroupUdevd,
resources: &cgroup2.Resources{},
name: constants.CgroupUdevd,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupUdevdReservedMemory),
Low: pointer.To[int64](constants.CgroupUdevdReservedMemory * 2),
},
},
},
{
name: constants.CgroupPodRuntimeRoot,
resources: &cgroup2.Resources{
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](constants.CgroupPodRuntimeRootCPUWeight),
},
},
},
{
name: constants.CgroupPodRuntime,
Expand All @@ -208,6 +232,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory),
Low: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](constants.CgroupPodRuntimeCPUWeight),
},
},
},
{
Expand All @@ -217,14 +244,39 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri
Min: pointer.To[int64](constants.CgroupKubeletReservedMemory),
Low: pointer.To[int64](constants.CgroupKubeletReservedMemory * 2),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](constants.CgroupKubeletCPUWeight),
},
},
},
{
name: constants.CgroupDashboard,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupDashboardReservedMemory),
Low: pointer.To[int64](constants.CgroupDashboardLowMemory),
Max: pointer.To[int64](constants.CgroupDashboardMaxMemory),
},
CPU: &cgroup2.CPU{
Weight: pointer.To[uint64](constants.CgroupDashboardCPUWeight),
},
},
},
{
name: constants.CgroupApid,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupApidReservedMemory),
Low: pointer.To[int64](constants.CgroupApidReservedMemory * 2),
Max: pointer.To[int64](constants.CgroupApidMaxMemory),
},
},
},
{
name: constants.CgroupTrustd,
resources: &cgroup2.Resources{
Memory: &cgroup2.Memory{
Min: pointer.To[int64](constants.CgroupTrustdReservedMemory),
Low: pointer.To[int64](constants.CgroupTrustdReservedMemory * 2),
Max: pointer.To[int64](constants.CgroupTrustdMaxMemory),
},
},
},
Expand Down
12 changes: 0 additions & 12 deletions internal/app/machined/pkg/system/runner/containerd/opts.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,6 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
)

// WithMemoryLimit returns an OCI spec option which sets the Linux memory
// limit field of the container spec to limit.
//
// Any section of the spec missing on the way to the field (Linux,
// Linux.Resources, Linux.Resources.Memory) is allocated on demand, so the
// option can be applied to a spec which has no resources configured yet.
// Memory settings already present on the spec (other than the limit itself)
// are left untouched.
//
// The returned option never fails.
func WithMemoryLimit(limit int64) oci.SpecOpts {
	return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
		// Guard every level: dereferencing a nil section would panic.
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}

		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}

		if s.Linux.Resources.Memory == nil {
			s.Linux.Resources.Memory = &specs.LinuxMemory{}
		}

		// Only touch the limit, so that options applied earlier
		// (e.g. a memory reservation) are not discarded.
		s.Linux.Resources.Memory.Limit = &limit

		return nil
	}
}

// WithRootfsPropagation sets the root filesystem propagation.
func WithRootfsPropagation(rp string) oci.SpecOpts {
return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error {
Expand Down
20 changes: 20 additions & 0 deletions internal/app/machined/pkg/system/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,18 @@
package runner

import (
"context"
"fmt"
"io"
"time"

containerd "github.com/containerd/containerd/v2/client"
ocicontainers "github.com/containerd/containerd/v2/core/containers"
"github.com/containerd/containerd/v2/pkg/oci"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/siderolabs/gen/maps"
"github.com/siderolabs/gen/optional"
"github.com/siderolabs/go-pointer"

"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/logging"
Expand Down Expand Up @@ -220,3 +223,20 @@ func WithUID(uid uint32) Option {
args.UID = uid
}
}

// WithMemoryReservation returns an OCI spec option which sets the memory
// reservation field in the Linux resources section of the OCI spec to limit.
//
// Any section of the spec missing on the way to the field (Linux,
// Linux.Resources, Linux.Resources.Memory) is allocated on demand, and memory
// settings already present on the spec are preserved.
//
// The spec stores the reservation as a signed 64-bit integer, so the option
// returns an error if limit does not fit into int64 instead of silently
// wrapping around to a negative value.
func WithMemoryReservation(limit uint64) oci.SpecOpts {
	return func(_ context.Context, _ oci.Client, _ *ocicontainers.Container, s *oci.Spec) error {
		// Largest value representable as int64.
		const maxReservation = 1<<63 - 1

		if limit > maxReservation {
			return fmt.Errorf("memory reservation %d overflows int64", limit)
		}

		// Guard every level: dereferencing a nil section would panic.
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}

		if s.Linux.Resources == nil {
			s.Linux.Resources = &specs.LinuxResources{}
		}

		if s.Linux.Resources.Memory == nil {
			s.Linux.Resources.Memory = &specs.LinuxMemory{}
		}

		s.Linux.Resources.Memory.Reservation = pointer.To(int64(limit))

		return nil
	}
}
2 changes: 2 additions & 0 deletions internal/app/machined/pkg/system/services/apid.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"net"
"os"
"path/filepath"
"strconv"
"strings"

"github.com/containerd/containerd/v2/pkg/cap"
Expand Down Expand Up @@ -164,6 +165,7 @@ func (o *APID) Runner(r runtime.Runtime) (runner.Runner, error) {

env := []string{
constants.TcellMinimizeEnvironment,
"GOMEMLIMIT=" + strconv.Itoa(constants.CgroupApidMaxMemory/5*4),
}

for _, value := range environment.Get(r.Config()) {
Expand Down
2 changes: 2 additions & 0 deletions internal/app/machined/pkg/system/services/dashboard.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package services
import (
"context"
"fmt"
"strconv"

"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/events"
Expand Down Expand Up @@ -61,6 +62,7 @@ func (d *Dashboard) Runner(r runtime.Runtime) (runner.Runner, error) {
runner.WithEnv([]string{
"TERM=linux",
constants.TcellMinimizeEnvironment,
"GOMEMLIMIT=" + strconv.Itoa(constants.CgroupDashboardMaxMemory/5*4),
}),
runner.WithStdinFile(tty),
runner.WithStdoutFile(tty),
Expand Down
1 change: 1 addition & 0 deletions internal/app/machined/pkg/system/services/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ func (e *Etcd) Runner(r runtime.Runtime) (runner.Runner, error) {
oci.WithHostNamespace(specs.NetworkNamespace),
oci.WithMounts(mounts),
oci.WithUser(fmt.Sprintf("%d:%d", constants.EtcdUserID, constants.EtcdUserID)),
runner.WithMemoryReservation(constants.CgroupEtcdReservedMemory),
),
runner.WithOOMScoreAdj(-998),
),
Expand Down
7 changes: 5 additions & 2 deletions internal/app/machined/pkg/system/services/trustd.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"net"
"os"
"path/filepath"
"strconv"

"github.com/containerd/containerd/v2/pkg/cap"
"github.com/containerd/containerd/v2/pkg/oci"
Expand Down Expand Up @@ -142,7 +143,10 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) {
}

env := environment.Get(r.Config())
env = append(env, constants.TcellMinimizeEnvironment)
env = append(env,
constants.TcellMinimizeEnvironment,
"GOMEMLIMIT="+strconv.Itoa(constants.CgroupTrustdMaxMemory/5*4),
)

if debug.RaceEnabled {
env = append(env, "GORACE=halt_on_error=1")
Expand All @@ -156,7 +160,6 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) {
runner.WithEnv(env),
runner.WithCgroupPath(constants.CgroupTrustd),
runner.WithOCISpecOpts(
containerd.WithMemoryLimit(int64(1000000*512)),
oci.WithDroppedCapabilities(cap.Known()),
oci.WithHostNamespace(specs.NetworkNamespace),
oci.WithMounts(mounts),
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/mount/cgroups.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func CGroupMountPoints() (mountpoints *Points, err error) {
// cgroupMountPointsV2 builds a new set of mount points holding a single entry
// registered under the key "cgroup2", which targets constants.CgroupMountPath.
//
// The returned error is always nil.
func cgroupMountPointsV2() (mountpoints *Points, err error) {
cgroups := NewMountPoints()

// NOTE(review): both Set calls below use the same "cgroup2" key and differ only
// in the trailing option string ("nsdelegate" vs. "nsdelegate,memory_recursiveprot").
// This looks like the before/after pair of a change; presumably only the second
// one is intended to remain — confirm and drop the other.
cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate"))
cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate,memory_recursiveprot"))

return cgroups, nil
}
Expand Down
55 changes: 47 additions & 8 deletions pkg/machinery/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -672,50 +672,89 @@ const (
// CgroupInitReservedMemory is the hard memory protection for the init process.
CgroupInitReservedMemory = 96 * 1024 * 1024

// CgroupInitCPUWeight is the CPU weight for the init process.
CgroupInitCPUWeight = 300

// CgroupSystem is the cgroup name for system processes.
CgroupSystem = "/system"

// CgroupSystemCPUWeight is the CPU weight for the system cgroup.
CgroupSystemCPUWeight = 150

// CgroupSystemReservedMemory is the hard memory protection for the system processes.
CgroupSystemReservedMemory = 96 * 1024 * 1024

// CgroupSystemRuntime is the cgroup name for containerd runtime processes.
CgroupSystemRuntime = CgroupSystem + "/runtime"

// CgroupSystemRuntimeReservedMemory is the hard memory protection for the system containerd process.
CgroupSystemRuntimeReservedMemory = 48 * 1024 * 1024

// CgroupApid is the cgroup name for apid runtime processes.
CgroupApid = CgroupSystem + "/apid"

// CgroupApidReservedMemory is the hard memory protection for the apid processes.
CgroupApidReservedMemory = 16 * 1024 * 1024

// CgroupApidMaxMemory is the hard memory limit for the apid process.
CgroupApidMaxMemory = 40 * 1024 * 1024

// CgroupTrustd is the cgroup name for trustd runtime processes.
CgroupTrustd = CgroupSystem + "/trustd"

// CgroupTrustdReservedMemory is the hard memory protection for the trustd processes.
CgroupTrustdReservedMemory = 8 * 1024 * 1024

// CgroupTrustdMaxMemory is the hard memory limit for the trustd process.
CgroupTrustdMaxMemory = 24 * 1024 * 1024

// CgroupUdevd is the cgroup name for udevd runtime processes.
CgroupUdevd = CgroupSystem + "/udevd"

// CgroupUdevdReservedMemory is the hard memory protection for the udevd processes.
CgroupUdevdReservedMemory = 8 * 1024 * 1024

// CgroupExtensions is the cgroup name for system extension processes.
CgroupExtensions = CgroupSystem + "/extensions"

// CgroupDashboard is the cgroup name for dashboard process.
CgroupDashboard = CgroupSystem + "/dashboard"

// CgroupPodRuntimeRoot is the cgroup containing Kubernetes runtime components.
CgroupPodRuntimeRoot = "/podruntime"

// CgroupPodRuntimeRootCPUWeight is the CPU weight for the pod runtime cgroup.
CgroupPodRuntimeRootCPUWeight = 150

// CgroupPodRuntime is the cgroup name for kubernetes containerd runtime processes.
CgroupPodRuntime = "/podruntime/runtime"
CgroupPodRuntime = CgroupPodRuntimeRoot + "/runtime"

// CgroupPodRuntimeCPUWeight is the CPU weight for the Kubernetes containerd runtime cgroup.
CgroupPodRuntimeCPUWeight = 150

// CgroupPodRuntimeReservedMemory is the hard memory protection for the cri runtime processes.
CgroupPodRuntimeReservedMemory = 128 * 1024 * 1024

// CgroupEtcd is the cgroup name for etcd process.
CgroupEtcd = "/podruntime/etcd"
CgroupEtcd = CgroupPodRuntimeRoot + "/etcd"

// CgroupEtcdReservedMemory is the soft memory protection for the etcd processes.
CgroupEtcdReservedMemory = 256 * 1024 * 1024

// CgroupKubelet is the cgroup name for kubelet process.
CgroupKubelet = "/podruntime/kubelet"
CgroupKubelet = CgroupPodRuntimeRoot + "/kubelet"

// CgroupKubeletReservedMemory is the hard memory protection for the kubelet processes.
CgroupKubeletReservedMemory = 64 * 1024 * 1024
CgroupKubeletReservedMemory = 96 * 1024 * 1024

// CgroupKubeletCPUWeight is the CPU weight for the kubelet process.
CgroupKubeletCPUWeight = 150

// CgroupDashboardReservedMemory is the hard memory protection for the dashboard process.
CgroupDashboardReservedMemory = 85 * 1024 * 1024
// CgroupDashboardMaxMemory is the hard memory limit for the dashboard process.
CgroupDashboardMaxMemory = 196 * 1024 * 1024

// CgroupDashboardLowMemory is the low memory value for the dashboard process.
CgroupDashboardLowMemory = 100 * 1024 * 1024
// CgroupDashboardCPUWeight is the CPU weight for the dashboard process.
CgroupDashboardCPUWeight = 10

// FlannelCNI is the string to use Talos-managed Flannel CNI (default).
FlannelCNI = "flannel"
Expand Down

0 comments on commit ed43698

Please sign in to comment.