From 40feb1653172ff458a51a228a50dd4e1e7f8299a Mon Sep 17 00:00:00 2001
From: Roman Penyaev
Date: Sat, 21 Sep 2024 16:29:57 +0200
Subject: [PATCH 1/2] pillar: configure GOGC based on pillar memory limit or
 global config

This patch introduces two settings for the Golang runtime which impact
garbage collector behavior:

1. `gogc.memory.limit.bytes` provides the runtime with a soft memory
   limit. The runtime undertakes several processes to try to respect this
   memory limit, including adjustments to the frequency of garbage
   collections and returning memory to the underlying system more
   aggressively. The Go API call is described here:

   https://pkg.go.dev/runtime/debug#SetMemoryLimit

   By default, the EVE setting is disabled (set to 0), meaning the Golang
   runtime memory limit is derived from the `memory.limit_in_bytes` hard
   memory limit provided by the pillar `cgroups`:

   `limit = memory.limit_in_bytes * 0.6`

   The constant 0.6 was chosen empirically and follows simple logic:
   `memory.limit_in_bytes` is a hard limit for the whole pillar cgroup,
   meaning that when it is reached, one of the processes will likely be
   killed by OOM. The Golang runtime memory limit, in turn, is a soft
   limit, so the difference between the two must be significant to ensure
   that once the soft limit is reached, there is enough memory left for
   the Go garbage collector to do its job and, hopefully, not hit the
   hard limit.

2. `gogc.percent` sets the garbage collection target percentage: a
   collection is triggered when the ratio of freshly allocated data to
   live data remaining after the previous collection reaches this
   percentage. The Go API call is described here:

   https://pkg.go.dev/runtime/debug#SetGCPercent

The patch is motivated by a frequently observed bloated `zedbox`
application (up to 500MB) that triggers an OOM kill in the /eve or
/pillar cgroups. It is assumed that the bloat of the `zedbox` application
is caused not by memory leaks, but by a delayed GC sweep cycle and an
unconditionally growing runtime heap size. An explicit memory limit set
for the Golang runtime (~400MB in the current version of EVE) should make
the GC more aggressive when the soft memory limit is hit, which should
result in a significant reduction of allocated but unused memory.
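For illustration, here is a minimal standalone sketch of the two runtime
calls described above (not part of this patch; the cgroup path without the
`/hostfs` prefix, the `main` wrapper, and the printed output are assumptions
of the example):

```go
package main

import (
	"fmt"
	"os"
	"runtime/debug"
	"strconv"
	"strings"
)

func main() {
	// Read the pillar cgroup hard limit (path as used by the patch,
	// assumed here without the /hostfs prefix).
	data, err := os.ReadFile("/sys/fs/cgroup/memory/eve/services/pillar/memory.limit_in_bytes")
	if err != nil {
		fmt.Println("cgroup limit unavailable:", err)
		return
	}
	hard, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
	if err != nil {
		fmt.Println("malformed cgroup limit:", err)
		return
	}

	// Soft limit = 0.6 of the hard limit, as in the patch.
	soft := int64(hard) * 600 / 1000

	prevLimit := debug.SetMemoryLimit(soft) // returns the previous limit
	prevPercent := debug.SetGCPercent(100)  // returns the previous percent
	fmt.Printf("memory limit %d -> %d, GC percent %d -> 100\n",
		prevLimit, soft, prevPercent)
}
```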
Signed-off-by: Roman Penyaev
---
 pkg/pillar/cmd/zedmanager/zedmanager.go | 12 ++++++++
 pkg/pillar/types/global.go              |  9 +++++-
 pkg/pillar/types/global_test.go         |  2 ++
 pkg/pillar/types/locationconsts.go      |  2 ++
 pkg/pillar/types/memory.go              | 41 +++++++++++++++++++++++++
 5 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/pkg/pillar/cmd/zedmanager/zedmanager.go b/pkg/pillar/cmd/zedmanager/zedmanager.go
index d4c3a81503..20787a6896 100644
--- a/pkg/pillar/cmd/zedmanager/zedmanager.go
+++ b/pkg/pillar/cmd/zedmanager/zedmanager.go
@@ -1422,6 +1422,17 @@ func quantifyChanges(config types.AppInstanceConfig, oldConfig types.AppInstance
 	return needPurge, needRestart, purgeReason, restartReason
 }
 
+func configureGOGC(gcp *types.ConfigItemValueMap) {
+	lim := gcp.GlobalValueInt(types.GOGCMemoryLimitInBytes)
+	per := gcp.GlobalValueInt(types.GOGCPercent)
+	plim, pper, err := types.ConfigureGOGC(int64(lim), int(per))
+	if err != nil {
+		log.Warningf("configureGOGC: failed '%v'", err)
+	} else {
+		log.Functionf("configureGOGC: memory limit set to '%v' (previous '%v'), GC percent set to '%v' (previous '%v')", lim, plim, per, pper)
+	}
+}
+
 func handleGlobalConfigCreate(ctxArg interface{}, key string,
 	statusArg interface{}) {
 	handleGlobalConfigImpl(ctxArg, key, statusArg)
@@ -1447,6 +1458,7 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string,
 		ctx.globalConfig = gcp
 		ctx.GCInitialized = true
 	}
+	configureGOGC(gcp)
 	log.Functionf("handleGlobalConfigImpl done for %s", key)
 }
 
diff --git a/pkg/pillar/types/global.go b/pkg/pillar/types/global.go
index 32b7161314..8bf82e6417 100644
--- a/pkg/pillar/types/global.go
+++ b/pkg/pillar/types/global.go
@@ -231,6 +231,10 @@ const (
 	EveMemoryLimitInBytes GlobalSettingKey = "memory.eve.limit.bytes"
 	// How much memory overhead is allowed for VMM needs
 	VmmMemoryLimitInMiB GlobalSettingKey = "memory.vmm.limit.MiB"
+	// GOGCMemoryLimitInBytes global setting key
+	GOGCMemoryLimitInBytes GlobalSettingKey = "gogc.memory.limit.bytes"
+	// GOGCPercent global setting key
+	GOGCPercent GlobalSettingKey = "gogc.percent"
 	// IgnoreMemoryCheckForApps global setting key
 	IgnoreMemoryCheckForApps GlobalSettingKey = "memory.apps.ignore.check"
 	// IgnoreDiskCheckForApps global setting key
@@ -892,7 +896,10 @@ func NewConfigItemSpecMap() ConfigItemSpecMap {
 		100*1024*1024, 0xFFFFFFFF)
 	configItemSpecMap.AddIntItem(StorageZfsReserved, 20, 1, 99)
 	configItemSpecMap.AddIntItem(ForceFallbackCounter, 0, 0, 0xFFFFFFFF)
-
+	// Default GOGC memory limit is 0
+	configItemSpecMap.AddIntItem(GOGCMemoryLimitInBytes, 0, 0, 0xFFFFFFFF)
+	// Default GOGC target percentage is 100, 0 means disable GC
+	configItemSpecMap.AddIntItem(GOGCPercent, 100, 0, 500)
 	configItemSpecMap.AddIntItem(EveMemoryLimitInBytes, uint32(eveMemoryLimitInBytes),
 		uint32(eveMemoryLimitInBytes), 0xFFFFFFFF)
 	// Limit manual vmm overhead override to 1 PiB
diff --git a/pkg/pillar/types/global_test.go b/pkg/pillar/types/global_test.go
index 4ea67ab11b..948c1c114a 100644
--- a/pkg/pillar/types/global_test.go
+++ b/pkg/pillar/types/global_test.go
@@ -183,6 +183,8 @@ func TestNewConfigItemSpecMap(t *testing.T) {
 		ConsoleAccess,
 		VncShimVMAccess,
 		AllowAppVnc,
+		GOGCMemoryLimitInBytes,
+		GOGCPercent,
 		EveMemoryLimitInBytes,
 		VmmMemoryLimitInMiB,
 		IgnoreMemoryCheckForApps,
diff --git a/pkg/pillar/types/locationconsts.go b/pkg/pillar/types/locationconsts.go
index 5d36f443e0..4ce9298d4b 100644
--- a/pkg/pillar/types/locationconsts.go
+++ b/pkg/pillar/types/locationconsts.go
@@ -98,6 +98,8 @@ const (
 	NewlogUploadAppDir = NewlogDir + "/appUpload"
 	// NewlogKeepSentQueueDir - a circular queue of gzip files already been sent
 	NewlogKeepSentQueueDir = NewlogDir + "/keepSentQueue"
+	// PillarHardMemoryLimitFile - hard memory reserved for pillar
+	PillarHardMemoryLimitFile = "/hostfs/sys/fs/cgroup/memory/eve/services/pillar/memory.limit_in_bytes"
 	// EveMemoryLimitFile - stores memory reserved for eve
 	EveMemoryLimitFile = "/hostfs/sys/fs/cgroup/memory/eve/memory.soft_limit_in_bytes"
 	// EveMemoryUsageFile - current usage
diff --git a/pkg/pillar/types/memory.go b/pkg/pillar/types/memory.go
index accaff8514..a52ff042cd 100644
--- a/pkg/pillar/types/memory.go
+++ b/pkg/pillar/types/memory.go
@@ -4,11 +4,19 @@ package types
 
 import (
+	"fmt"
 	"os"
+	"runtime/debug"
 	"strconv"
 	"strings"
 )
 
+// GetPillarHardMemoryLimitInBytes returns the hard memory limit
+// reserved for pillar in bytes
+func GetPillarHardMemoryLimitInBytes() (uint64, error) {
+	return readUint64File(PillarHardMemoryLimitFile)
+}
+
 // GetEveMemoryLimitInBytes returns memory limit
 // reserved for eve in bytes
 func GetEveMemoryLimitInBytes() (uint64, error) {
@@ -42,3 +50,36 @@ func readUint64File(filename string) (uint64, error) {
 	dataUint64, err := strconv.ParseUint(dataString, 10, 64)
 	return dataUint64, err
 }
+
+// ConfigureGOGC sets two main configuration parameters for the
+// garbage collector (GOGC): memory limit and percentage (see
+// explanation here: https://tip.golang.org/doc/gc-guide).
+// If limit is 0, the GOGC limit is derived from the pillar
+// cgroups hard memory limit.
+func ConfigureGOGC(limit int64, percent int) (int64, int, error) {
+	if limit == 0 {
+		// Fall back to the value from cgroups if no limit is configured
+		ulimit, err := GetPillarHardMemoryLimitInBytes()
+		if err != nil {
+			err := fmt.Errorf("can't retrieve pillar memory hard limit: '%w'", err)
+			return -1, -1, err
+		}
+		// Reduce the memory limit to 0.6 of the cgroup limit. The logic behind
+		// the constant is simple: the cgroup limit is a hard limit for the
+		// whole pillar cgroup, meaning that when it is reached, we are killed
+		// by OOM. The GOGC memory limit, in turn, is a soft limit, so the
+		// difference must be significant to ensure that after the soft limit
+		// is reached, there is enough memory for the GOGC to do its job and,
+		// hopefully, not to hit the hard limit.
+		limit = int64(ulimit) * 600 / 1000
+	}
+	if percent == 0 {
+		// Disable GC
+		percent = -1
+	}
+	// Set the new values and retrieve the previous ones
+	limit = debug.SetMemoryLimit(limit)
+	percent = debug.SetGCPercent(percent)
+
+	return limit, percent, nil
+}
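As a hedged illustration of how the new `types.ConfigureGOGC` helper might
be exercised outside a device (a sketch only, not part of either patch; the
test name and the explicit 400 MiB value are hypothetical, and the import
path assumes the pillar module layout):

```go
package types_test

import (
	"testing"

	"github.com/lf-edge/eve/pkg/pillar/types"
)

func TestConfigureGOGCExplicitLimit(t *testing.T) {
	// An explicit non-zero limit skips the cgroup fallback, which is
	// only available on an EVE device (hypothetical test value).
	const limit = 400 * 1024 * 1024

	prevLimit, prevPercent, err := types.ConfigureGOGC(limit, 100)
	if err != nil {
		t.Fatalf("ConfigureGOGC failed: %v", err)
	}
	// Restore the previous runtime settings so other tests are unaffected.
	defer types.ConfigureGOGC(prevLimit, prevPercent)
}
```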
From cf3f6838d34f7b5edc2a9f374b360f8920d30d6b Mon Sep 17 00:00:00 2001
From: Roman Penyaev
Date: Sun, 22 Sep 2024 16:35:09 +0200
Subject: [PATCH 2/2] docs: describe GOGC memory limit and percent settings

Documentation for the recently introduced `gogc.memory.limit.bytes` and
`gogc.percent` Golang runtime settings.

Signed-off-by: Roman Penyaev
---
 docs/CONFIG-PROPERTIES.md |  2 ++
 docs/MEMORY-SETTINGS.md   | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/docs/CONFIG-PROPERTIES.md b/docs/CONFIG-PROPERTIES.md
index 55df7fb0b6..dd2691b015 100644
--- a/docs/CONFIG-PROPERTIES.md
+++ b/docs/CONFIG-PROPERTIES.md
@@ -48,6 +48,8 @@
 | newlog.allow.fastupload | boolean | false | allow faster upload gzip logfiles to controller |
 | memory.apps.ignore.check | boolean | false | Ignore memory usage check for Apps|
 | memory.vmm.limit.MiB | integer | 0 | Manually override how much overhead is allocated for each running VMM |
+| gogc.memory.limit.bytes | integer | 0 | Golang runtime soft memory limit, see details in the API doc: [SetMemoryLimit](https://pkg.go.dev/runtime/debug#SetMemoryLimit) |
+| gogc.percent | integer | 100 | Golang runtime garbage collector target percentage, see details in the API doc: [SetGCPercent](https://pkg.go.dev/runtime/debug#SetGCPercent) |
 | newlog.gzipfiles.ondisk.maxmegabytes | integer in Mbytes | 2048 | the quota for keepig newlog gzip files on device |
 | process.cloud-init.multipart | boolean | false | help VMs which do not handle mime multi-part themselves |
 | netdump.enable | boolean | true | enable publishing of network diagnostics (as tgz archives to /persist/netdump) |
diff --git a/docs/MEMORY-SETTINGS.md b/docs/MEMORY-SETTINGS.md
index 0eb1f6e8c8..7eaa4c17d5 100644
--- a/docs/MEMORY-SETTINGS.md
+++ b/docs/MEMORY-SETTINGS.md
@@ -186,6 +186,40 @@
 eve config unmount /mnt
 reboot
 ```
+
+## Golang runtime garbage collector settings
+
+The Golang runtime provides two parameters which impact garbage collector (GC)
+behavior; both are available through the EVE debug settings:
+
+1. `gogc.memory.limit.bytes` provides the runtime with a soft memory limit.
+   The runtime undertakes several processes to try to respect this memory
+   limit, including adjustments to the frequency of garbage collections and
+   returning memory to the underlying system more aggressively. The Go API
+   call is described [here](https://pkg.go.dev/runtime/debug#SetMemoryLimit).
+
+   By default, the EVE setting is disabled (set to 0), meaning the Golang
+   runtime memory limit is derived from the `memory.limit_in_bytes` hard
+   memory limit provided by the pillar `cgroups`:
+
+   `limit = memory.limit_in_bytes * 0.6`
+
+   The constant 0.6 was chosen empirically and follows simple logic:
+   `memory.limit_in_bytes` is a hard limit for the whole pillar cgroup, meaning
+   that when it is reached, one of the processes will likely be killed by OOM.
+   The Golang runtime memory limit, in turn, is a soft limit, so the difference
+   must be significant to ensure that after the soft limit is reached, there is
+   enough memory for the GC to do its job and, hopefully, not hit the hard limit.
+
+2. `gogc.percent` sets the garbage collection target percentage: a collection
+   is triggered when the ratio of freshly allocated data to live data remaining
+   after the previous collection reaches this percentage. The Go API call is
+   described [here](https://pkg.go.dev/runtime/debug#SetGCPercent).
+
+Changing these parameters is recommended only as a last resort, for example to
+debug an OOM kill caused by a bloated `zedbox` process. Before changing the
+values, please read the [documentation](https://tip.golang.org/doc/gc-guide) carefully.
+
 ## User applications memory settings
 
 Besides the obvious memory settings of RAM that comes from the controller, there
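To make the `gogc.percent` semantics described above concrete, here is a
small standalone sketch (an illustrative assumption, not part of the
patches) showing the heap-size goal the Go runtime derives from the live
heap:

```go
package main

import (
	"fmt"
	"runtime"
	"runtime/debug"
)

func main() {
	debug.SetGCPercent(100)
	runtime.GC() // complete a cycle so the statistics below are settled

	var m runtime.MemStats
	runtime.ReadMemStats(&m)
	// NextGC is the heap-size goal for the upcoming cycle; with GOGC=100
	// it is roughly twice the live heap left by the previous collection.
	fmt.Printf("live heap ~%d KiB, next GC triggered near ~%d KiB\n",
		m.HeapAlloc/1024, m.NextGC/1024)
}
```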