diff --git a/docs/CONFIG-PROPERTIES.md b/docs/CONFIG-PROPERTIES.md
index 55df7fb0b6..dd2691b015 100644
--- a/docs/CONFIG-PROPERTIES.md
+++ b/docs/CONFIG-PROPERTIES.md
@@ -48,6 +48,8 @@
 | newlog.allow.fastupload | boolean | false | allow faster upload gzip logfiles to controller |
 | memory.apps.ignore.check | boolean | false | Ignore memory usage check for Apps|
 | memory.vmm.limit.MiB | integer | 0 | Manually override how much overhead is allocated for each running VMM |
+| gogc.memory.limit.bytes | integer | 0 | Golang runtime soft memory limit, see details in the [SetMemoryLimit](https://pkg.go.dev/runtime/debug#SetMemoryLimit) API doc |
+| gogc.percent | integer | 100 | Golang runtime garbage collector target percentage, see details in the [SetGCPercent](https://pkg.go.dev/runtime/debug#SetGCPercent) API doc |
 | newlog.gzipfiles.ondisk.maxmegabytes | integer in Mbytes | 2048 | the quota for keepig newlog gzip files on device |
 | process.cloud-init.multipart | boolean | false | help VMs which do not handle mime multi-part themselves |
 | netdump.enable | boolean | true | enable publishing of network diagnostics (as tgz archives to /persist/netdump) |
diff --git a/docs/MEMORY-SETTINGS.md b/docs/MEMORY-SETTINGS.md
index 0eb1f6e8c8..7eaa4c17d5 100644
--- a/docs/MEMORY-SETTINGS.md
+++ b/docs/MEMORY-SETTINGS.md
@@ -186,6 +186,40 @@
 eve config unmount /mnt
 reboot
 ```
+
+## Golang runtime garbage collector settings
+
+The Golang runtime provides two parameters that impact garbage collector (GC)
+behavior; both are available through the EVE debug settings:
+
+1. `gogc.memory.limit.bytes` provides the runtime with a soft memory limit.
+   The runtime undertakes several processes to try to respect this memory
+   limit, including adjusting the frequency of garbage collections and
+   returning memory to the underlying system more aggressively. The Go API
+   call is described [here](https://pkg.go.dev/runtime/debug#SetMemoryLimit).
+
+   By default, the EVE setting is disabled (set to 0), meaning the Golang
+   runtime memory limit is set according to the following equation, based on
+   the `memory.limit_in_bytes` hard memory limit provided by the pillar
+   `cgroups`:
+
+   `limit = memory.limit_in_bytes * 0.6`
+
+   The constant 0.6 was chosen empirically, and the logic behind it is
+   simple: `memory.limit_in_bytes` is a hard limit for the whole pillar
+   cgroup, meaning that when it is reached, one of the processes will likely
+   be killed by the OOM killer. The Golang runtime memory limit, in turn, is
+   a soft limit, so the difference between the two must be significant enough
+   to ensure that once the soft limit is reached, the Go garbage collector
+   still has enough memory to do its job without hitting the hard limit (see
+   the worked example at the end of this section).
+
+2. `gogc.percent` sets the garbage collection target percentage: a collection
+   is triggered when the ratio of freshly allocated data to live data
+   remaining after the previous collection reaches this percentage. The Go
+   API call is described [here](https://pkg.go.dev/runtime/debug#SetGCPercent).
+
+Changing these parameters is recommended only as a last resort, for example to
+debug an OOM kill caused by a bloated `zedbox` process. Before changing the
+values, please read the [documentation](https://tip.golang.org/doc/gc-guide)
+carefully.
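+
+As a hypothetical worked example (650 MiB is an illustrative figure, not an
+EVE default): on a device where the pillar cgroup hard limit is 650 MiB, the
+fallback soft limit for the Go runtime would be:
+
+```text
+650 MiB * 0.6 = 390 MiB
+```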
+
 ## User applications memory settings
 
 Besides the obvious memory settings of RAM that comes from the controller, there
diff --git a/pkg/pillar/cmd/zedmanager/zedmanager.go b/pkg/pillar/cmd/zedmanager/zedmanager.go
index d4c3a81503..20787a6896 100644
--- a/pkg/pillar/cmd/zedmanager/zedmanager.go
+++ b/pkg/pillar/cmd/zedmanager/zedmanager.go
@@ -1422,6 +1422,17 @@ func quantifyChanges(config types.AppInstanceConfig, oldConfig types.AppInstance
 	return needPurge, needRestart, purgeReason, restartReason
 }
 
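+// configureGOGC applies the GOGC settings from the device's global config to
+// the Go runtime of the pillar process, logging both the new values and the
+// previous ones returned by the runtime.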
+func configureGOGC(gcp *types.ConfigItemValueMap) {
+	lim := gcp.GlobalValueInt(types.GOGCMemoryLimitInBytes)
+	per := gcp.GlobalValueInt(types.GOGCPercent)
+	plim, pper, err := types.ConfigureGOGC(int64(lim), int(per))
+	if err != nil {
+		log.Warningf("configureGOGC: failed '%v'", err)
+	} else {
+		log.Functionf("configureGOGC: memory limit set to '%v' (previous '%v'), GC percent set to '%v' (previous '%v')", lim, plim, per, pper)
+	}
+}
+
 func handleGlobalConfigCreate(ctxArg interface{}, key string,
 	statusArg interface{}) {
 	handleGlobalConfigImpl(ctxArg, key, statusArg)
@@ -1447,6 +1458,7 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string,
 		ctx.globalConfig = gcp
 		ctx.GCInitialized = true
 	}
+	configureGOGC(gcp)
 
 	log.Functionf("handleGlobalConfigImpl done for %s", key)
 }
diff --git a/pkg/pillar/types/global.go b/pkg/pillar/types/global.go
index 32b7161314..8bf82e6417 100644
--- a/pkg/pillar/types/global.go
+++ b/pkg/pillar/types/global.go
@@ -231,6 +231,10 @@ const (
 	EveMemoryLimitInBytes GlobalSettingKey = "memory.eve.limit.bytes"
 	// How much memory overhead is allowed for VMM needs
 	VmmMemoryLimitInMiB GlobalSettingKey = "memory.vmm.limit.MiB"
+	// GOGCMemoryLimitInBytes global setting key
+	GOGCMemoryLimitInBytes GlobalSettingKey = "gogc.memory.limit.bytes"
+	// GOGCPercent global setting key
+	GOGCPercent GlobalSettingKey = "gogc.percent"
 	// IgnoreMemoryCheckForApps global setting key
 	IgnoreMemoryCheckForApps GlobalSettingKey = "memory.apps.ignore.check"
 	// IgnoreDiskCheckForApps global setting key
@@ -892,7 +896,10 @@ func NewConfigItemSpecMap() ConfigItemSpecMap {
 		100*1024*1024, 0xFFFFFFFF)
 	configItemSpecMap.AddIntItem(StorageZfsReserved, 20, 1, 99)
 	configItemSpecMap.AddIntItem(ForceFallbackCounter, 0, 0, 0xFFFFFFFF)
-
+	// Default GOGC memory limit is 0
+	configItemSpecMap.AddIntItem(GOGCMemoryLimitInBytes, 0, 0, 0xFFFFFFFF)
+	// Default GOGC target percentage is 100, 0 means disable GC
+	configItemSpecMap.AddIntItem(GOGCPercent, 100, 0, 500)
 	configItemSpecMap.AddIntItem(EveMemoryLimitInBytes, uint32(eveMemoryLimitInBytes),
 		uint32(eveMemoryLimitInBytes), 0xFFFFFFFF)
 	// Limit manual vmm overhead override to 1 PiB
diff --git a/pkg/pillar/types/global_test.go b/pkg/pillar/types/global_test.go
index 4ea67ab11b..948c1c114a 100644
--- a/pkg/pillar/types/global_test.go
+++ b/pkg/pillar/types/global_test.go
@@ -183,6 +183,8 @@ func TestNewConfigItemSpecMap(t *testing.T) {
 		ConsoleAccess,
 		VncShimVMAccess,
 		AllowAppVnc,
+		GOGCMemoryLimitInBytes,
+		GOGCPercent,
 		EveMemoryLimitInBytes,
 		VmmMemoryLimitInMiB,
 		IgnoreMemoryCheckForApps,
diff --git a/pkg/pillar/types/locationconsts.go b/pkg/pillar/types/locationconsts.go
index 5d36f443e0..4ce9298d4b 100644
--- a/pkg/pillar/types/locationconsts.go
+++ b/pkg/pillar/types/locationconsts.go
@@ -98,6 +98,8 @@ const (
 	NewlogUploadAppDir = NewlogDir + "/appUpload"
 	// NewlogKeepSentQueueDir - a circular queue of gzip files already been sent
 	NewlogKeepSentQueueDir = NewlogDir + "/keepSentQueue"
+	// PillarHardMemoryLimitFile - hard memory limit reserved for pillar
+	PillarHardMemoryLimitFile = "/hostfs/sys/fs/cgroup/memory/eve/services/pillar/memory.limit_in_bytes"
 	// EveMemoryLimitFile - stores memory reserved for eve
 	EveMemoryLimitFile = "/hostfs/sys/fs/cgroup/memory/eve/memory.soft_limit_in_bytes"
 	// EveMemoryUsageFile - current usage
diff --git a/pkg/pillar/types/memory.go b/pkg/pillar/types/memory.go
index accaff8514..a52ff042cd 100644
--- a/pkg/pillar/types/memory.go
+++ b/pkg/pillar/types/memory.go
@@ -4,11 +4,19 @@ package types
 
 import (
+	"fmt"
 	"os"
+	"runtime/debug"
 	"strconv"
 	"strings"
 )
 
+// GetPillarHardMemoryLimitInBytes returns the hard memory limit
+// reserved for pillar, in bytes
+func GetPillarHardMemoryLimitInBytes() (uint64, error) {
+	return readUint64File(PillarHardMemoryLimitFile)
+}
+
 // GetEveMemoryLimitInBytes returns memory limit
 // reserved for eve in bytes
 func GetEveMemoryLimitInBytes() (uint64, error) {
@@ -42,3 +50,36 @@ func readUint64File(filename string) (uint64, error) {
 	dataUint64, err := strconv.ParseUint(dataString, 10, 64)
 	return dataUint64, err
 }
+
+// ConfigureGOGC sets the two main configuration parameters for the
+// garbage collector (GOGC): memory limit and percentage (see the
+// explanation here: https://tip.golang.org/doc/gc-guide).
+// If limit is 0, the GOGC limit is derived from the pillar cgroups
+// hard memory limit.
+func ConfigureGOGC(limit int64, percent int) (int64, int, error) {
+	if limit == 0 {
+		// Fall back to the value from cgroups if no limit is configured
+		ulimit, err := GetPillarHardMemoryLimitInBytes()
+		if err != nil {
+			err := fmt.Errorf("can't read pillar memory hard limit: '%w'", err)
+			return -1, -1, err
+		}
+		// Reduce the actual memory limit to 0.6 of the cgroup limit. The
+		// logic behind the constant is simple: the cgroup limit is a hard
+		// limit for the whole pillar cgroup, meaning that when it is reached,
+		// we are killed by OOM. The GOGC memory limit, in turn, is a soft
+		// limit, so the difference must be significant to ensure that after
+		// the soft limit is reached, there is still enough memory for the
+		// GOGC to do its job without hitting the hard limit.
+		limit = int64(ulimit) * 600 / 1000
+	}
+	if percent == 0 {
+		// Disable GC
+		percent = -1
+	}
+	// Set the new values and retrieve the previous ones
+	limit = debug.SetMemoryLimit(limit)
+	percent = debug.SetGCPercent(percent)
+
+	return limit, percent, nil
+}
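
For reference, the `runtime/debug` semantics the patch relies on can be exercised with a minimal standalone sketch; the program below is illustrative only (the 650 MiB figure is hypothetical), not part of the patch:

```go
package main

import (
	"fmt"
	"runtime/debug"
)

func main() {
	// A negative input to SetMemoryLimit does not change the limit; it only
	// returns the currently configured value.
	cur := debug.SetMemoryLimit(-1)
	fmt.Printf("current soft memory limit: %d bytes\n", cur)

	// Mimic the fallback in types.ConfigureGOGC: 0.6 of a hypothetical
	// 650 MiB cgroup hard limit.
	hard := int64(650 * 1024 * 1024)
	prev := debug.SetMemoryLimit(hard * 600 / 1000)
	fmt.Printf("soft limit set to %d bytes (previous %d)\n", hard*600/1000, prev)

	// SetGCPercent(-1) disables the collector, matching the gogc.percent == 0
	// handling in the patch; it also returns the previous setting.
	prevPct := debug.SetGCPercent(-1)
	fmt.Printf("GC disabled (previous GOGC=%d%%)\n", prevPct)
}
```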