Skip to content

Commit 15d3102

Browse files
committed
Add k8s global memory indicators
The kubelet will terminate end-user pods when the worker node has 'MemoryPressure' according to [1]. Confusingly, there are two possible reasons for pods being evicted: - one is that the whole machine's free memory is too low, - the other is that k8s's own calculation[2], i.e. memory.available[3], is too low. To resolve this confusion for k8s users, collect and show the k8s global working-set memory to distinguish between these two causes. Note: 1. Collecting only the k8s global memory stats is enough, because cgroupfs stats are propagated from child to parent, so the parent always notices the change and updates accordingly. Also, since k8s v1.6[4], allocatable (/sys/fs/cgroup/memory/kubepods/) is more convincing than capacity (/sys/fs/cgroup/memory/). 2. There are two cgroup drivers (managers) to control resources: cgroupfs and systemd[5]. We should take both into account. (With the 'systemd' cgroup driver, the cgroup name always ends with '.slice'.) 3. The differences between cgroup v1 and cgroup v2: different field names in the memory.stat file, and the current memory usage is stored in different files (cgroup v1's memory.usage_in_bytes vs. cgroup v2's memory.current). [1]https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior [2]kubernetes/kubernetes#43916 [3]memory.available = memory.allocatable/capacity - memory.workingSet, memory.workingSet = memory.currentUsage - memory.inactivefile [4]kubernetes/kubernetes#42204 kubernetes/community#348 [5]https://kubernetes.io/docs/tasks/administer-cluster/kubeadm/configure-cgroup-driver/ Signed-off-by: Fei Li <[email protected]> Reported-by: Teng Hu <[email protected]>
1 parent 5f36fd7 commit 15d3102

File tree

7 files changed

+386
-0
lines changed

7 files changed

+386
-0
lines changed

deviate.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1455,6 +1455,17 @@ deviatsyst(struct sstat *cur, struct sstat *pre, struct sstat *dev,
14551455

14561456
dev->llc.nrllcs = cur->llc.nrllcs;
14571457

1458+
dev->k8smem.file = cur->k8smem.file;
1459+
dev->k8smem.anon = cur->k8smem.anon;
1460+
dev->k8smem.shmem = cur->k8smem.shmem;
1461+
dev->k8smem.filemapped = cur->k8smem.filemapped;
1462+
dev->k8smem.inactiveanon = cur->k8smem.inactiveanon;
1463+
dev->k8smem.activeanon = cur->k8smem.activeanon;
1464+
dev->k8smem.inactivefile = cur->k8smem.inactivefile;
1465+
dev->k8smem.activefile = cur->k8smem.activefile;
1466+
dev->k8smem.usagefile = cur->k8smem.usagefile;
1467+
dev->k8smem.workingset = cur->k8smem.workingset;
1468+
14581469
#if HTTPSTATS
14591470
/*
14601471
** application-specific counters

man/atop.1

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,6 +1343,23 @@ the number of memory pages the system wrote to swap space (`swout'), and
13431343
the number of out-of-memory kills (`oomkill').
13441344
.PP
13451345
.TP 5
1346+
.B K8S
1347+
K8S global memory statistics, taken from /sys/fs/cgroup/[memory/]kubepods/memory.stat.
1348+
.br
1349+
This line shows the number of file pages for k8s global memcg (`file'),
1350+
the number of mapped anonymous pages for k8s global memcg (`anon'),
1351+
the number of shmem pages (including tmpfs/GEM pages) for k8s global
1352+
memcg (`shmem'), the number of pagecache pages mapped into pagetables
1353+
for k8s global memcg (`fmap'), the number of lru inactive anon pages
1354+
for k8s global memcg (`inan'), the number of lru active anon pages
1355+
for k8s global memcg (`actan'), the number of lru inactive file pages
1356+
for k8s global memcg (`infl'), the number of lru active file pages
1357+
for k8s global memcg (`actfl'), the number of pages currently in use
1358+
by k8s global memcg, including usermem and kmem (`usage'), the
1359+
number of workingset pages (from the k8s point of view: number of current
1360+
usage pages minus inactive file pages) for k8s global memcg (`wkset').
1361+
.PP
1362+
.TP 5
13461363
.B PSI
13471364
Pressure Stall Information.
13481365
.br

photosyst.c

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,14 @@
8383
/* recognize numa node */
8484
#define NUMADIR "/sys/devices/system/node"
8585

86+
/*
** locations of the k8s global (kubepods) memory cgroup: the cgroup
** directory for cgroup v1 and for cgroup v2, the suffix that is appended
** to that directory name as a second attempt (systemd cgroup driver),
** and the names of the files inside the directory that hold memory.stat
** and the current memory usage
*/
87+
#define K8S_MEMDIR_CGV1 "/sys/fs/cgroup/memory/kubepods"
88+
#define K8S_MEMDIR_CGV2 "/sys/fs/cgroup/kubepods"
89+
#define K8S_SYSTEMD_CM ".slice"
90+
#define K8S_MEM_STAT "/memory.stat"
91+
#define K8S_MEM_CGV1_USAGE "/memory.usage_in_bytes"
92+
#define K8S_MEM_CGV2_USAGE "/memory.current"
93+
8694
/* recognize LLC monitor data */
8795
#define LLCDIR "/sys/fs/resctrl/mon_data"
8896
#define L3SIZE "/sys/devices/system/cpu/cpu0/cache/index3/size"
@@ -874,6 +882,151 @@ photosyst(struct sstat *si)
874882
}
875883
}
876884

885+
if ( supportflags & CGROUPV2 )
886+
{
887+
if ( (fp = fopen(K8S_MEMDIR_CGV2 K8S_MEM_STAT, "r")) != NULL ||
888+
(fp = fopen(K8S_MEMDIR_CGV2 K8S_SYSTEMD_CM K8S_MEM_STAT, "r")) != NULL )
889+
{
890+
/* for cgroup v2 */
891+
while ( fgets(linebuf, sizeof(linebuf), fp) != NULL )
892+
{
893+
nr = sscanf(linebuf, "%s %lld\n", nam, &cnts[0]);
894+
895+
if ( strcmp("file", nam) == EQ )
896+
{
897+
si->k8smem.file = cnts[0]/pagesize;
898+
continue;
899+
}
900+
if ( strcmp("anon", nam) == EQ )
901+
{
902+
si->k8smem.anon = cnts[0]/pagesize;
903+
continue;
904+
}
905+
if ( strcmp("shmem", nam) == EQ )
906+
{
907+
si->k8smem.shmem = cnts[0]/pagesize;
908+
continue;
909+
}
910+
if ( strcmp("file_mapped", nam) == EQ )
911+
{
912+
si->k8smem.filemapped = cnts[0]/pagesize;
913+
continue;
914+
}
915+
if ( strcmp("inactive_anon", nam) == EQ )
916+
{
917+
si->k8smem.inactiveanon = cnts[0]/pagesize;
918+
continue;
919+
}
920+
if ( strcmp("active_anon", nam) == EQ )
921+
{
922+
si->k8smem.activeanon = cnts[0]/pagesize;
923+
continue;
924+
}
925+
if ( strcmp("inactive_file", nam) == EQ )
926+
{
927+
si->k8smem.inactivefile = cnts[0]/pagesize;
928+
continue;
929+
}
930+
if ( strcmp("active_file", nam) == EQ )
931+
{
932+
si->k8smem.activefile = cnts[0]/pagesize;
933+
continue;
934+
}
935+
}
936+
937+
fclose(fp);
938+
}
939+
940+
if ( (fp = fopen(K8S_MEMDIR_CGV2 K8S_MEM_CGV2_USAGE, "r")) != NULL ||
941+
(fp = fopen(K8S_MEMDIR_CGV2 K8S_SYSTEMD_CM K8S_MEM_CGV2_USAGE, "r")) != NULL )
942+
{
943+
if ( fscanf(fp, "%lld", &cnts[0]) == 1 )
944+
{
945+
/*
946+
** Refer to https://github.com/kubernetes/kubernetes/issues/43916,
947+
** memory.available := node.status.capacity[memory] - node.stats.memory.workingSet
948+
** && workingSet := $cgroupfs/memory.current - inactive_file
949+
*/
950+
si->k8smem.usagefile = cnts[0]/pagesize;
951+
si->k8smem.workingset = si->k8smem.usagefile - si->k8smem.inactivefile;
952+
}
953+
954+
fclose(fp);
955+
}
956+
}
957+
else
958+
{
959+
if ( (fp = fopen(K8S_MEMDIR_CGV1 K8S_MEM_STAT, "r")) != NULL ||
960+
(fp = fopen(K8S_MEMDIR_CGV1 K8S_SYSTEMD_CM K8S_MEM_STAT, "r")) != NULL )
961+
{
962+
/* for cgroup v1 */
963+
while ( fgets(linebuf, sizeof(linebuf), fp) != NULL )
964+
{
965+
nr = sscanf(linebuf, "%s %lld\n", nam, &cnts[0]);
966+
967+
if ( strcmp("total_cache", nam) == EQ )
968+
{
969+
si->k8smem.file = cnts[0]/pagesize;
970+
continue;
971+
}
972+
if ( strcmp("total_rss", nam) == EQ)
973+
{
974+
si->k8smem.anon = cnts[0]/pagesize;
975+
continue;
976+
}
977+
if ( strcmp("total_shmem", nam) == EQ)
978+
{
979+
si->k8smem.shmem = cnts[0]/pagesize;
980+
continue;
981+
}
982+
if ( strcmp("total_mapped_file", nam) == EQ)
983+
{
984+
si->k8smem.filemapped = cnts[0]/pagesize;
985+
continue;
986+
}
987+
if ( strcmp("total_inactive_anon", nam) == EQ)
988+
{
989+
si->k8smem.inactiveanon = cnts[0]/pagesize;
990+
continue;
991+
}
992+
if ( strcmp("total_active_anon", nam) == EQ)
993+
{
994+
si->k8smem.activeanon = cnts[0]/pagesize;
995+
continue;
996+
}
997+
if ( strcmp("total_inactive_file", nam) == EQ)
998+
{
999+
si->k8smem.inactivefile = cnts[0]/pagesize;
1000+
continue;
1001+
}
1002+
if ( strcmp("total_active_file", nam) == EQ)
1003+
{
1004+
si->k8smem.activefile = cnts[0]/pagesize;
1005+
continue;
1006+
}
1007+
}
1008+
1009+
fclose(fp);
1010+
}
1011+
1012+
if ( (fp = fopen(K8S_MEMDIR_CGV1 K8S_MEM_CGV1_USAGE, "r")) != NULL ||
1013+
(fp = fopen(K8S_MEMDIR_CGV1 K8S_SYSTEMD_CM K8S_MEM_CGV1_USAGE, "r")) != NULL )
1014+
{
1015+
if ( fscanf(fp, "%lld", &cnts[0]) == 1 )
1016+
{
1017+
/*
1018+
** Refer to https://github.com/kubernetes/kubernetes/issues/43916,
1019+
** memory.available := node.status.capacity[memory] - node.stats.memory.workingSet
1020+
** && workingSet := $cgroupfs/memory.usage_in_bytes - total_inactive_file
1021+
*/
1022+
si->k8smem.usagefile = cnts[0]/pagesize;
1023+
si->k8smem.workingset = si->k8smem.usagefile - si->k8smem.inactivefile;
1024+
}
1025+
1026+
fclose(fp);
1027+
}
1028+
}
1029+
8771030
/*
8781031
** gather per numa memory-related statistics from the file
8791032
** /sys/devices/system/node/node0/meminfo, and store them in binary form.

photosyst.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,19 @@ struct llcstat {
423423
struct perllc perllc[MAXLLC];
424424
};
425425

426+
/*
** memory counters of the k8s global (kubepods) memory cgroup;
** all values are stored as numbers of pages
*/
struct k8smem {
427+
count_t file; /* number of file pages for k8s global memcg */
428+
count_t anon; /* number of mapped anonymous pages for k8s global memcg */
429+
count_t shmem; /* number of shmem pages (including tmpfs/GEM pages) for k8s global memcg */
430+
count_t filemapped; /* number of pagecache pages mapped into pagetables for k8s global memcg */
431+
count_t inactiveanon; /* number of lru inactive anon pages for k8s global memcg */
432+
count_t activeanon; /* number of lru active anon pages for k8s global memcg */
433+
count_t inactivefile; /* number of lru inactive file pages for k8s global memcg */
434+
count_t activefile; /* number of lru active file pages for k8s global memcg */
435+
count_t usagefile; /* number of current usage pages for k8s global memcg */
436+
count_t workingset; /* working set as seen by k8s: current usage pages minus inactive file pages */
437+
};
438+
426439
/************************************************************************/
427440

428441
struct sstat {
@@ -439,6 +452,7 @@ struct sstat {
439452
struct gpustat gpu;
440453
struct ifbstat ifb;
441454
struct llcstat llc;
455+
struct k8smem k8smem;
442456

443457
struct wwwstat www;
444458
};

showlinux.c

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,19 @@ sys_printdef *llcsyspdefs[] = {
244244
&syspdef_BLANKBOX,
245245
0
246246
};
247+
/*
** print definitions for the K8S line (k8s global memory statistics);
** this table is passed to make_sys_prints() together with k8smemline
** and is terminated by a 0 entry
*/
sys_printdef *k8smemsyspdefs[] = {
248+
&syspdef_K8SFILE,
249+
&syspdef_K8SANON,
250+
&syspdef_K8SSHMEM,
251+
&syspdef_K8SFILEMAPPED,
252+
&syspdef_K8SINACTIVEANON,
253+
&syspdef_K8SACTIVEANON,
254+
&syspdef_K8SINACTIVEFILE,
255+
&syspdef_K8SACTIVEFILE,
256+
&syspdef_K8SUSAGEFILE,
257+
&syspdef_K8SWORKINGSET,
258+
0
259+
};
247260
sys_printdef *psisyspdefs[] = {
248261
&syspdef_PSICPUSTOT,
249262
&syspdef_PSIMEMSTOT,
@@ -520,6 +533,7 @@ sys_printpair swpline[MAXITEMS];
520533
sys_printpair memnumaline[MAXITEMS];
521534
sys_printpair cpunumaline[MAXITEMS];
522535
sys_printpair llcline[MAXITEMS];
536+
sys_printpair k8smemline[MAXITEMS];
523537
sys_printpair pagline[MAXITEMS];
524538
sys_printpair psiline[MAXITEMS];
525539
sys_printpair contline[MAXITEMS];
@@ -1027,6 +1041,23 @@ pricumproc(struct sstat *sstat, struct devtstat *devtstat,
10271041
sstat, &extra);
10281042
}
10291043

1044+
if (k8smemline[0].f == 0)
1045+
{
1046+
make_sys_prints(k8smemline, MAXITEMS,
1047+
"K8SFILE:1 "
1048+
"K8SANON:1 "
1049+
"K8SSHMEM:1 "
1050+
"K8SFILEMAPPED:2 "
1051+
"K8SACTIVEANON:2 "
1052+
"K8SINACTIVEANON:2 "
1053+
"K8SACTIVEFILE:2 "
1054+
"K8SINACTIVEFILE:1 "
1055+
"K8SUSAGEFILE:1 "
1056+
"K8SWORKINGSET:1 ",
1057+
k8smemsyspdefs, "builtin k8smemline",
1058+
sstat, &extra);
1059+
}
1060+
10301061
if (pagline[0].f == 0)
10311062
{
10321063
make_sys_prints(pagline, MAXITEMS,
@@ -2010,6 +2041,28 @@ prisyst(struct sstat *sstat, int curline, int nsecs, int avgval,
20102041
}
20112042
}
20122043

2044+
/*
2045+
** k8s global memory.stat statistics
2046+
*/
2047+
if (fixedhead ||
2048+
sstat->k8smem.file ||
2049+
sstat->k8smem.anon ||
2050+
sstat->k8smem.shmem ||
2051+
sstat->k8smem.filemapped ||
2052+
sstat->k8smem.inactiveanon ||
2053+
sstat->k8smem.activeanon ||
2054+
sstat->k8smem.inactivefile ||
2055+
sstat->k8smem.activefile ||
2056+
sstat->k8smem.usagefile ||
2057+
sstat->k8smem.workingset )
2058+
{
2059+
if (screen)
2060+
move(curline, 0);
2061+
2062+
showsysline(k8smemline, sstat, &extra, "K8S", 0);
2063+
curline++;
2064+
}
2065+
20132066
/*
20142067
** PAGING statistics
20152068
*/
@@ -2978,6 +3031,13 @@ do_ownllcline(char *name, char *val)
29783031
NULL, NULL);
29793032
}
29803033

3034+
/*
** fill k8smemline from the item string 'val' by handing it to
** make_sys_prints() together with the k8smemsyspdefs table and 'name';
** the last two arguments of make_sys_prints() are passed as NULL
** NOTE(review): presumably the handler for a user-defined K8S line
** layout -- confirm against the caller
*/
void
3035+
do_ownk8smemline(char *name, char *val)
3036+
{
3037+
make_sys_prints(k8smemline, MAXITEMS, val, k8smemsyspdefs, name,
3038+
NULL, NULL);
3039+
}
3040+
29813041
void
29823042
do_owndskline(char *name, char *val)
29833043
{

showlinux.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ void do_ownpagline(char *, char *);
144144
void do_ownmemnumaline(char *, char *);
145145
void do_owncpunumaline(char *, char *);
146146
void do_ownllcline(char *, char *);
147+
void do_ownk8smemline(char *, char *);
147148
void do_owndskline(char *, char *);
148149
void do_ownnettransportline(char *, char *);
149150
void do_ownnetnetline(char *, char *);
@@ -266,6 +267,16 @@ extern sys_printdef syspdef_NUMACPUGUEST;
266267
extern sys_printdef syspdef_LLCMBMTOTAL;
267268
extern sys_printdef syspdef_LLCMBMLOCAL;
268269
extern sys_printdef syspdef_NUMLLC;
270+
extern sys_printdef syspdef_K8SFILE;
271+
extern sys_printdef syspdef_K8SANON;
272+
extern sys_printdef syspdef_K8SSHMEM;
273+
extern sys_printdef syspdef_K8SFILEMAPPED;
274+
extern sys_printdef syspdef_K8SACTIVEANON;
275+
extern sys_printdef syspdef_K8SINACTIVEANON;
276+
extern sys_printdef syspdef_K8SACTIVEFILE;
277+
extern sys_printdef syspdef_K8SINACTIVEFILE;
278+
extern sys_printdef syspdef_K8SUSAGEFILE;
279+
extern sys_printdef syspdef_K8SWORKINGSET;
269280
extern sys_printdef syspdef_PAGSCAN;
270281
extern sys_printdef syspdef_PAGSTEAL;
271282
extern sys_printdef syspdef_PAGSTALL;

0 commit comments

Comments
 (0)