From ed436980a064db3f7318c48a327da30551fb392f Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Thu, 19 Sep 2024 21:23:16 +0400 Subject: [PATCH] fix: audit and fix cgroup reservations Fixes: #7081 Review all reservations and limits set, test under stress load (using both memory and CPU). The goal: system components (Talos itself) and runtime (kubelet, CRI) should survive under extreme resource starvation (workloads consuming all CPU/memory). Uses #9337 to visualize changes, but doesn't depend on it. Signed-off-by: Andrey Smirnov --- .../v1alpha1/v1alpha1_sequencer_tasks.go | 64 +++++++++++++++++-- .../pkg/system/runner/containerd/opts.go | 12 ---- .../app/machined/pkg/system/runner/runner.go | 20 ++++++ .../app/machined/pkg/system/services/apid.go | 2 + .../machined/pkg/system/services/dashboard.go | 2 + .../app/machined/pkg/system/services/etcd.go | 1 + .../machined/pkg/system/services/trustd.go | 7 +- internal/pkg/mount/cgroups.go | 2 +- pkg/machinery/constants/constants.go | 55 +++++++++++++--- 9 files changed, 136 insertions(+), 29 deletions(-) diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go index e7986a006f..fe49d99837 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go @@ -182,6 +182,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri Min: pointer.To[int64](constants.CgroupInitReservedMemory), Low: pointer.To[int64](constants.CgroupInitReservedMemory * 2), }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](constants.CgroupInitCPUWeight), + }, }, }, { @@ -191,15 +194,36 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri Min: pointer.To[int64](constants.CgroupSystemReservedMemory), Low: pointer.To[int64](constants.CgroupSystemReservedMemory * 2), }, + CPU: &cgroup2.CPU{ + 
Weight: pointer.To[uint64](constants.CgroupSystemCPUWeight), + }, }, }, { - name: constants.CgroupSystemRuntime, - resources: &cgroup2.Resources{}, + name: constants.CgroupSystemRuntime, + resources: &cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Min: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory), + Low: pointer.To[int64](constants.CgroupSystemRuntimeReservedMemory * 2), + }, + }, }, { - name: constants.CgroupUdevd, - resources: &cgroup2.Resources{}, + name: constants.CgroupUdevd, + resources: &cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Min: pointer.To[int64](constants.CgroupUdevdReservedMemory), + Low: pointer.To[int64](constants.CgroupUdevdReservedMemory * 2), + }, + }, + }, + { + name: constants.CgroupPodRuntimeRoot, + resources: &cgroup2.Resources{ + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](constants.CgroupPodRuntimeRootCPUWeight), + }, + }, }, { name: constants.CgroupPodRuntime, @@ -208,6 +232,9 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri Min: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory), Low: pointer.To[int64](constants.CgroupPodRuntimeReservedMemory * 2), }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](constants.CgroupPodRuntimeCPUWeight), + }, }, }, { @@ -217,14 +244,39 @@ func CreateSystemCgroups(runtime.Sequence, any) (runtime.TaskExecutionFunc, stri Min: pointer.To[int64](constants.CgroupKubeletReservedMemory), Low: pointer.To[int64](constants.CgroupKubeletReservedMemory * 2), }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](constants.CgroupKubeletCPUWeight), + }, }, }, { name: constants.CgroupDashboard, resources: &cgroup2.Resources{ Memory: &cgroup2.Memory{ - Min: pointer.To[int64](constants.CgroupDashboardReservedMemory), - Low: pointer.To[int64](constants.CgroupDashboardLowMemory), + Max: pointer.To[int64](constants.CgroupDashboardMaxMemory), + }, + CPU: &cgroup2.CPU{ + Weight: pointer.To[uint64](constants.CgroupDashboardCPUWeight), + }, + }, + }, + { + 
name: constants.CgroupApid, + resources: &cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Min: pointer.To[int64](constants.CgroupApidReservedMemory), + Low: pointer.To[int64](constants.CgroupApidReservedMemory * 2), + Max: pointer.To[int64](constants.CgroupApidMaxMemory), + }, + }, + }, + { + name: constants.CgroupTrustd, + resources: &cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Min: pointer.To[int64](constants.CgroupTrustdReservedMemory), + Low: pointer.To[int64](constants.CgroupTrustdReservedMemory * 2), + Max: pointer.To[int64](constants.CgroupTrustdMaxMemory), }, }, }, diff --git a/internal/app/machined/pkg/system/runner/containerd/opts.go b/internal/app/machined/pkg/system/runner/containerd/opts.go index b0f36efe48..ef466f1623 100644 --- a/internal/app/machined/pkg/system/runner/containerd/opts.go +++ b/internal/app/machined/pkg/system/runner/containerd/opts.go @@ -13,18 +13,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" ) -// WithMemoryLimit sets the linux resource memory limit field. -func WithMemoryLimit(limit int64) oci.SpecOpts { - return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { - s.Linux.Resources.Memory = &specs.LinuxMemory{ - Limit: &limit, - // DisableOOMKiller: &disable, - } - - return nil - } -} - // WithRootfsPropagation sets the root filesystem propagation. 
func WithRootfsPropagation(rp string) oci.SpecOpts { return func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { diff --git a/internal/app/machined/pkg/system/runner/runner.go b/internal/app/machined/pkg/system/runner/runner.go index a9e6735c42..96eed7e3ba 100644 --- a/internal/app/machined/pkg/system/runner/runner.go +++ b/internal/app/machined/pkg/system/runner/runner.go @@ -6,15 +6,18 @@ package runner import ( + "context" "fmt" "io" "time" containerd "github.com/containerd/containerd/v2/client" + ocicontainers "github.com/containerd/containerd/v2/core/containers" "github.com/containerd/containerd/v2/pkg/oci" "github.com/opencontainers/runtime-spec/specs-go" "github.com/siderolabs/gen/maps" "github.com/siderolabs/gen/optional" + "github.com/siderolabs/go-pointer" "github.com/siderolabs/talos/internal/app/machined/pkg/runtime" "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/logging" @@ -220,3 +223,20 @@ func WithUID(uid uint32) Option { args.UID = uid } } + +// WithMemoryReservation sets the memory reservation (soft limit) in the OCI runtime spec.
+func WithMemoryReservation(limit uint64) oci.SpecOpts { + return func(_ context.Context, _ oci.Client, _ *ocicontainers.Container, s *oci.Spec) error { + if s.Linux.Resources == nil { + s.Linux.Resources = &specs.LinuxResources{} + } + + if s.Linux.Resources.Memory == nil { + s.Linux.Resources.Memory = &specs.LinuxMemory{} + } + + s.Linux.Resources.Memory.Reservation = pointer.To(int64(limit)) + + return nil + } +} diff --git a/internal/app/machined/pkg/system/services/apid.go b/internal/app/machined/pkg/system/services/apid.go index 1f8e55547d..da7f72bf39 100644 --- a/internal/app/machined/pkg/system/services/apid.go +++ b/internal/app/machined/pkg/system/services/apid.go @@ -12,6 +12,7 @@ import ( "net" "os" "path/filepath" + "strconv" "strings" "github.com/containerd/containerd/v2/pkg/cap" @@ -164,6 +165,7 @@ func (o *APID) Runner(r runtime.Runtime) (runner.Runner, error) { env := []string{ constants.TcellMinimizeEnvironment, + "GOMEMLIMIT=" + strconv.Itoa(constants.CgroupApidMaxMemory/5*4), } for _, value := range environment.Get(r.Config()) { diff --git a/internal/app/machined/pkg/system/services/dashboard.go b/internal/app/machined/pkg/system/services/dashboard.go index 05aa6ddabb..0f5100a6d8 100644 --- a/internal/app/machined/pkg/system/services/dashboard.go +++ b/internal/app/machined/pkg/system/services/dashboard.go @@ -8,6 +8,7 @@ package services import ( "context" "fmt" + "strconv" "github.com/siderolabs/talos/internal/app/machined/pkg/runtime" "github.com/siderolabs/talos/internal/app/machined/pkg/system/events" @@ -61,6 +62,7 @@ func (d *Dashboard) Runner(r runtime.Runtime) (runner.Runner, error) { runner.WithEnv([]string{ "TERM=linux", constants.TcellMinimizeEnvironment, + "GOMEMLIMIT=" + strconv.Itoa(constants.CgroupDashboardMaxMemory/5*4), }), runner.WithStdinFile(tty), runner.WithStdoutFile(tty), diff --git a/internal/app/machined/pkg/system/services/etcd.go b/internal/app/machined/pkg/system/services/etcd.go index 41a8b79638..894c2b02bd 100644 
--- a/internal/app/machined/pkg/system/services/etcd.go +++ b/internal/app/machined/pkg/system/services/etcd.go @@ -224,6 +224,7 @@ func (e *Etcd) Runner(r runtime.Runtime) (runner.Runner, error) { oci.WithHostNamespace(specs.NetworkNamespace), oci.WithMounts(mounts), oci.WithUser(fmt.Sprintf("%d:%d", constants.EtcdUserID, constants.EtcdUserID)), + runner.WithMemoryReservation(constants.CgroupEtcdReservedMemory), ), runner.WithOOMScoreAdj(-998), ), diff --git a/internal/app/machined/pkg/system/services/trustd.go b/internal/app/machined/pkg/system/services/trustd.go index 7b7a47676a..36e4ab5b09 100644 --- a/internal/app/machined/pkg/system/services/trustd.go +++ b/internal/app/machined/pkg/system/services/trustd.go @@ -12,6 +12,7 @@ import ( "net" "os" "path/filepath" + "strconv" "github.com/containerd/containerd/v2/pkg/cap" "github.com/containerd/containerd/v2/pkg/oci" @@ -142,7 +143,10 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) { } env := environment.Get(r.Config()) - env = append(env, constants.TcellMinimizeEnvironment) + env = append(env, + constants.TcellMinimizeEnvironment, + "GOMEMLIMIT="+strconv.Itoa(constants.CgroupTrustdMaxMemory/5*4), + ) if debug.RaceEnabled { env = append(env, "GORACE=halt_on_error=1") @@ -156,7 +160,6 @@ func (t *Trustd) Runner(r runtime.Runtime) (runner.Runner, error) { runner.WithEnv(env), runner.WithCgroupPath(constants.CgroupTrustd), runner.WithOCISpecOpts( - containerd.WithMemoryLimit(int64(1000000*512)), oci.WithDroppedCapabilities(cap.Known()), oci.WithHostNamespace(specs.NetworkNamespace), oci.WithMounts(mounts), diff --git a/internal/pkg/mount/cgroups.go b/internal/pkg/mount/cgroups.go index 0214ff108d..9924eea174 100644 --- a/internal/pkg/mount/cgroups.go +++ b/internal/pkg/mount/cgroups.go @@ -32,7 +32,7 @@ func CGroupMountPoints() (mountpoints *Points, err error) { func cgroupMountPointsV2() (mountpoints *Points, err error) { cgroups := NewMountPoints() - cgroups.Set("cgroup2", 
NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate")) + cgroups.Set("cgroup2", NewMountPoint("cgroup", constants.CgroupMountPath, "cgroup2", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_RELATIME, "nsdelegate,memory_recursiveprot")) return cgroups, nil } diff --git a/pkg/machinery/constants/constants.go b/pkg/machinery/constants/constants.go index ff75c752a7..088883695d 100644 --- a/pkg/machinery/constants/constants.go +++ b/pkg/machinery/constants/constants.go @@ -672,50 +672,89 @@ const ( // CgroupInitReservedMemory is the hard memory protection for the init process. CgroupInitReservedMemory = 96 * 1024 * 1024 + // CgroupInitCPUWeight is the CPU weight for the init process. + CgroupInitCPUWeight = 300 + // CgroupSystem is the cgroup name for system processes. CgroupSystem = "/system" + // CgroupSystemCPUWeight is the CPU weight for the system cgroup. + CgroupSystemCPUWeight = 150 + // CgroupSystemReservedMemory is the hard memory protection for the system processes. CgroupSystemReservedMemory = 96 * 1024 * 1024 // CgroupSystemRuntime is the cgroup name for containerd runtime processes. CgroupSystemRuntime = CgroupSystem + "/runtime" + // CgroupSystemRuntimeReservedMemory is the hard memory protection for the system containerd process. + CgroupSystemRuntimeReservedMemory = 48 * 1024 * 1024 + // CgroupApid is the cgroup name for apid runtime processes. CgroupApid = CgroupSystem + "/apid" + // CgroupApidReservedMemory is the hard memory protection for the apid processes. + CgroupApidReservedMemory = 16 * 1024 * 1024 + + // CgroupApidMaxMemory is the hard memory limit for the apid process. + CgroupApidMaxMemory = 40 * 1024 * 1024 + // CgroupTrustd is the cgroup name for trustd runtime processes. CgroupTrustd = CgroupSystem + "/trustd" + // CgroupTrustdReservedMemory is the hard memory protection for the trustd processes. 
+ CgroupTrustdReservedMemory = 8 * 1024 * 1024 + + // CgroupTrustdMaxMemory is the hard memory limit for the trustd process. + CgroupTrustdMaxMemory = 24 * 1024 * 1024 + // CgroupUdevd is the cgroup name for udevd runtime processes. CgroupUdevd = CgroupSystem + "/udevd" + // CgroupUdevdReservedMemory is the hard memory protection for the udevd processes. + CgroupUdevdReservedMemory = 8 * 1024 * 1024 + // CgroupExtensions is the cgroup name for system extension processes. CgroupExtensions = CgroupSystem + "/extensions" // CgroupDashboard is the cgroup name for dashboard process. CgroupDashboard = CgroupSystem + "/dashboard" + // CgroupPodRuntimeRoot is the cgroup containing Kubernetes runtime components. + CgroupPodRuntimeRoot = "/podruntime" + + // CgroupPodRuntimeRootCPUWeight is the CPU weight for the pod runtime root cgroup. + CgroupPodRuntimeRootCPUWeight = 150 + // CgroupPodRuntime is the cgroup name for kubernetes containerd runtime processes. - CgroupPodRuntime = "/podruntime/runtime" + CgroupPodRuntime = CgroupPodRuntimeRoot + "/runtime" + + // CgroupPodRuntimeCPUWeight is the CPU weight for the kubernetes containerd runtime cgroup. + CgroupPodRuntimeCPUWeight = 150 // CgroupPodRuntimeReservedMemory is the hard memory protection for the cri runtime processes. CgroupPodRuntimeReservedMemory = 128 * 1024 * 1024 // CgroupEtcd is the cgroup name for etcd process. - CgroupEtcd = "/podruntime/etcd" + CgroupEtcd = CgroupPodRuntimeRoot + "/etcd" + + // CgroupEtcdReservedMemory is the soft memory protection for the etcd processes. + CgroupEtcdReservedMemory = 256 * 1024 * 1024 // CgroupKubelet is the cgroup name for kubelet process. - CgroupKubelet = "/podruntime/kubelet" + CgroupKubelet = CgroupPodRuntimeRoot + "/kubelet" // CgroupKubeletReservedMemory is the hard memory protection for the kubelet processes. - CgroupKubeletReservedMemory = 64 * 1024 * 1024 + CgroupKubeletReservedMemory = 96 * 1024 * 1024 + + // CgroupKubeletCPUWeight is the CPU weight for the kubelet process.
+ CgroupKubeletCPUWeight = 150 - // CgroupDashboardReservedMemory is the hard memory protection for the dashboard process. - CgroupDashboardReservedMemory = 85 * 1024 * 1024 + // CgroupDashboardMaxMemory is the hard memory limit for the dashboard process. + CgroupDashboardMaxMemory = 196 * 1024 * 1024 - // CgroupDashboardLowMemory is the low memory value for the dashboard process. - CgroupDashboardLowMemory = 100 * 1024 * 1024 + // CgroupDashboardCPUWeight is the CPU weight for the dashboard process. + CgroupDashboardCPUWeight = 10 // FlannelCNI is the string to use Tanos-managed Flannel CNI (default). FlannelCNI = "flannel"