From 21786e5cb375a1e58a9175fee423e1d7f892d965 Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Tue, 20 Dec 2022 16:14:15 +0100 Subject: [PATCH 01/11] cgroup/cpuset: no need to explicitly init a global static variable cpuset_rwsem is a static variable defined with DEFINE_STATIC_PERCPU_RWSEM(). It's initialized at build time and so there's no need for explicit runtime init leaking one percpu int. Signed-off-by: Daniel Vacek Reviewed-by: Aaron Tomlin Acked-by: Mukesh Ojha Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a29c0b13706bb..87fe410361b3d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3281,8 +3281,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { int __init cpuset_init(void) { - BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); From 56eb276701cb67aa6c52e0713e66b46b4d94b38f Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:26 +0700 Subject: [PATCH 02/11] docs: cgroup-v1: replace custom note constructs with appropriate admonition blocks Admonition constructs on the documentation use definition lists, which isn't fit for the purpose. 
Replace them with appropriate blocks: * Use caution:: for outdated document notice * hint:: for memo * note:: for other constructs * warning:: for memory reclaim Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- .../admin-guide/cgroup-v1/memory.rst | 58 ++++++++++--------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 60370f2c67b99..d0d8c780cb6bd 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -2,13 +2,13 @@ Memory Resource Controller ========================== -NOTE: +.. caution:: This document is hopelessly outdated and it asks for a complete rewrite. It still contains a useful information so we are keeping it here but make sure to check the current code if you need a deeper understanding. -NOTE: +.. note:: The Memory Resource Controller has generically been referred to as the memory controller in this document. Do not confuse memory controller used here with the memory controller that is used in hardware. @@ -274,12 +274,12 @@ The reclaim algorithm has not been modified for cgroups, except that pages that are selected for reclaiming come from the per-cgroup LRU list. -NOTE: - Reclaim does not work for the root cgroup, since we cannot set any - limits on the root cgroup. +.. note:: + Reclaim does not work for the root cgroup, since we cannot set any + limits on the root cgroup. -Note2: - When panic_on_oom is set to "2", the whole system will panic. +.. note:: + When panic_on_oom is set to "2", the whole system will panic. When oom event notifier is registered, event will be delivered. (See oom_control section) @@ -367,10 +367,10 @@ U != 0, K < U: never greater than the total memory, and freely set U at the cost of his QoS. 
-WARNING: - In the current implementation, memory reclaim will NOT be - triggered for a cgroup when it hits K while staying below U, which makes - this setup impractical. + .. warning:: + In the current implementation, memory reclaim will NOT be triggered for + a cgroup when it hits K while staying below U, which makes this setup + impractical. U != 0, K >= U: Since kmem charges will also be fed to the user counter and reclaim will be @@ -405,16 +405,16 @@ Since now we're in the 0 cgroup, we can alter the memory limit:: # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes -NOTE: - We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, - mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, - Gibibytes.) +.. note:: + We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, + mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, + Gibibytes.) -NOTE: - We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. +.. note:: + We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``. -NOTE: - We cannot set limits on the root cgroup any more. +.. note:: + We cannot set limits on the root cgroup any more. :: @@ -574,12 +574,12 @@ recent_scanned_anon VM internal parameter. (see mm/vmscan.c) recent_scanned_file VM internal parameter. (see mm/vmscan.c) ========================= ======================================== -Memo: +.. hint:: recent_rotated means recent frequency of LRU rotation. recent_scanned means recent # of scans to LRU. showing for better debug please see the code for meanings. -Note: +.. note:: Only anonymous and swap cache memory is listed as part of 'rss' stat. This should not be confused with the true 'resident set size' or the amount of physical memory used by the cgroup. @@ -710,10 +710,11 @@ If we want to change this to 1G, we can at any time use:: # echo 1G > memory.soft_limit_in_bytes -NOTE1: +.. 
note:: Soft limits take effect over a long period of time, since they involve reclaiming memory for balancing between memory cgroups -NOTE2: + +.. note:: It is recommended to set the soft limit always below the hard limit, otherwise the hard limit will take precedence. @@ -735,17 +736,20 @@ If you want to enable it:: # echo (some positive value) > memory.move_charge_at_immigrate -Note: +.. note:: Each bits of move_charge_at_immigrate has its own meaning about what type of charges should be moved. See 8.2 for details. -Note: + +.. note:: Charges are moved only when you move mm->owner, in other words, a leader of a thread group. -Note: + +.. note:: If we cannot find enough space for the task in the destination cgroup, we try to make space by reclaiming memory. Task migration may fail if we cannot make enough space. -Note: + +.. note:: It can take several seconds if you move charges much. And if you want disable it again:: From 4ddb1a2aa1a3c4317b94417f1bace0e4f06f51b9 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:27 +0700 Subject: [PATCH 03/11] docs: cgroup-v1: wrap remaining admonitions in admonition blocks Wrap two other admonitions in appropriate blocks in order for readers to pay more attention to block contents: * hint:: for editor's note * warning:: for move charges deprecation Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memory.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index d0d8c780cb6bd..b0353c40639b7 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -13,7 +13,7 @@ Memory Resource Controller memory controller in this document. Do not confuse memory controller used here with the memory controller that is used in hardware. -(For editors) In this document: +.. 
hint:: When we mention a cgroup (cgroupfs's directory) with memory controller, we call it "memory cgroup". When you see git-log and source code, you'll see patch's title and function names tend to use "memcg". From eb08489448fbc70eda1ba6cfdddd59de00ddf941 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:28 +0700 Subject: [PATCH 04/11] docs: cgroup-v1: use code block for locking order schema The locking order schema is a figure (like diagram), which should have been formatted with literal code block for consistency with other figures. Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memory.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index b0353c40639b7..2b807fc128c66 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -287,7 +287,7 @@ When oom event notifier is registered, event will be delivered. 2.6 Locking ----------- -Lock order is as follows: +Lock order is as follows:: Page lock (PG_locked bit of page->flags) mm->page_table_lock or split pte_lock From 71da431c30795716a1ca26158f608781b3eba33d Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:29 +0700 Subject: [PATCH 05/11] docs: cgroup-v1: fix footnotes The documentation contains external references, which some of them are marked as footnotes. Fix the syntax for them to be properly rendered as such. Non-footnote references aren't affected since the text for these is aligned the same to the footnotes. 
Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- .../admin-guide/cgroup-v1/memory.rst | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 2b807fc128c66..8d1cedcf44f6e 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -23,7 +23,7 @@ Benefits and Purpose of the memory controller ============================================= The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable +from the rest of the system. The article on LWN [12]_ mentions some probable uses of the memory controller. The memory controller can be used to a. Isolate an application or a group of applications @@ -107,16 +107,16 @@ Brief summary of control files. ========== The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted +controller was posted by Balbir Singh [1]_. At the time the RFC was posted there were several implementations for memory control. The goal of the RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is +for memory control. The first RSS controller was posted by Balbir Singh [2]_ +in Feb 2007. Pavel Emelianov [3]_ [4]_ [5]_ has since posted three versions +of the RSS controller. At OLS, at the resource management BoF, everyone +suggested that we handle both page cache and RSS together. 
Another request was +raised to allow user space handling of OOM. The current memory controller is at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. +Cache Control [11]_. 2. Memory Control ================= @@ -960,15 +960,16 @@ commented and discussed quite extensively in the community. References ========== -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), +.. [1] Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +.. [2] Singh, Balbir. Memory Controller (RSS Control), http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups +.. [3] Emelianov, Pavel. Resource controllers based on process cgroups https://lore.kernel.org/r/45ED7DEC.7010403@sw.ru -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) +.. [4] Emelianov, Pavel. RSS controller based on process cgroups (v2) https://lore.kernel.org/r/461A3010.90403@sw.ru -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) +.. [5] Emelianov, Pavel. RSS controller based on process cgroups (v3) https://lore.kernel.org/r/465D9739.8070209@openvz.org + 6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ 7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control subsystem (v3), http://lwn.net/Articles/235534/ @@ -978,7 +979,8 @@ References https://lore.kernel.org/r/464D267A.50107@linux.vnet.ibm.com 10. Singh, Balbir. Memory controller v6 test results, https://lore.kernel.org/r/20070819094658.654.84837.sendpatchset@balbir-laptop -11. Singh, Balbir. Memory controller introduction (v6), - https://lore.kernel.org/r/20070817084228.26003.12568.sendpatchset@balbir-laptop -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ + +.. [11] Singh, Balbir. 
Memory controller introduction (v6), + https://lore.kernel.org/r/20070817084228.26003.12568.sendpatchset@balbir-laptop +.. [12] Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ From f7423bb771d4acaad4e40addf9e09f5722f48b04 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:30 +0700 Subject: [PATCH 06/11] docs: cgroup-v1: move hierarchy of accounting caption The caption for hierarchy of accounting figure is in the code block, which is quite odd. Move the caption into :caption: option of code-block:: directive instead. Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memory.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 8d1cedcf44f6e..4d96a5bbbfcfd 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -147,7 +147,8 @@ specific data structure (mem_cgroup) associated with it. 2.2. Accounting --------------- -:: +.. code-block:: + :caption: Figure 1: Hierarchy of Accounting +--------------------+ | mem_cgroup | @@ -167,7 +168,6 @@ specific data structure (mem_cgroup) associated with it. | | | | +---------------+ +---------------+ - (Figure 1: Hierarchy of Accounting) Figure 1 shows the important aspects of the controller From b9d2a17b3290e251748c6a9f6d2b0a440401096c Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:31 +0700 Subject: [PATCH 07/11] docs: cgroup-v1: use bullet lists for list of stat file tables The stat file section contains three tables, where the leading texts for them are subsection heading. Organize them in the bullet list, while demoting headings into normal text. 
Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- .../admin-guide/cgroup-v1/memory.rst | 107 +++++++++--------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 4d96a5bbbfcfd..162cc26dcddb2 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -519,60 +519,59 @@ will be charged as a new owner of it. 5.2 stat file ------------- -memory.stat file includes following statistics - -per-memory cgroup local status -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -=============== =============================================================== -cache # of bytes of page cache memory. -rss # of bytes of anonymous and swap cache memory (includes - transparent hugepages). -rss_huge # of bytes of anonymous transparent hugepages. -mapped_file # of bytes of mapped file (includes tmpfs/shmem) -pgpgin # of charging events to the memory cgroup. The charging - event happens each time a page is accounted as either mapped - anon page(RSS) or cache page(Page Cache) to the cgroup. -pgpgout # of uncharging events to the memory cgroup. The uncharging - event happens each time a page is unaccounted from the cgroup. -swap # of bytes of swap usage -dirty # of bytes that are waiting to get written back to the disk. -writeback # of bytes of file/anon cache that are queued for syncing to - disk. -inactive_anon # of bytes of anonymous and swap cache memory on inactive - LRU list. -active_anon # of bytes of anonymous and swap cache memory on active - LRU list. -inactive_file # of bytes of file-backed memory and MADV_FREE anonymous memory( - LazyFree pages) on inactive LRU list. -active_file # of bytes of file-backed memory on active LRU list. -unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). 
-=============== =============================================================== - -status considering hierarchy (see memory.use_hierarchy settings) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -========================= =================================================== -hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy - under which the memory cgroup is -hierarchical_memsw_limit # of bytes of memory+swap limit with regard to - hierarchy under which memory cgroup is. - -total_<counter> # hierarchical version of <counter>, which in - addition to the cgroup's own value includes the - sum of all hierarchical children's values of - <counter>, i.e. total_cache -========================= =================================================== - -The following additional stats are dependent on CONFIG_DEBUG_VM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -========================= ======================================== -recent_rotated_anon VM internal parameter. (see mm/vmscan.c) -recent_rotated_file VM internal parameter. (see mm/vmscan.c) -recent_scanned_anon VM internal parameter. (see mm/vmscan.c) -recent_scanned_file VM internal parameter. (see mm/vmscan.c) -========================= ======================================== +memory.stat file includes following statistics: + + * per-memory cgroup local status + + =============== =============================================================== + cache # of bytes of page cache memory. + rss # of bytes of anonymous and swap cache memory (includes + transparent hugepages). + rss_huge # of bytes of anonymous transparent hugepages. + mapped_file # of bytes of mapped file (includes tmpfs/shmem) + pgpgin # of charging events to the memory cgroup. The charging + event happens each time a page is accounted as either mapped + anon page(RSS) or cache page(Page Cache) to the cgroup. + pgpgout # of uncharging events to the memory cgroup. 
The uncharging + event happens each time a page is unaccounted from the + cgroup. + swap # of bytes of swap usage + dirty # of bytes that are waiting to get written back to the disk. + writeback # of bytes of file/anon cache that are queued for syncing to + disk. + inactive_anon # of bytes of anonymous and swap cache memory on inactive + LRU list. + active_anon # of bytes of anonymous and swap cache memory on active + LRU list. + inactive_file # of bytes of file-backed memory and MADV_FREE anonymous + memory (LazyFree pages) on inactive LRU list. + active_file # of bytes of file-backed memory on active LRU list. + unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). + =============== =============================================================== + + * status considering hierarchy (see memory.use_hierarchy settings): + + ========================= =================================================== + hierarchical_memory_limit # of bytes of memory limit with regard to + hierarchy + under which the memory cgroup is + hierarchical_memsw_limit # of bytes of memory+swap limit with regard to + hierarchy under which memory cgroup is. + + total_<counter> # hierarchical version of <counter>, which in + addition to the cgroup's own value includes the + sum of all hierarchical children's values of + <counter>, i.e. total_cache + ========================= =================================================== + + * additional vm parameters (depends on CONFIG_DEBUG_VM): + + ========================= ======================================== + recent_rotated_anon VM internal parameter. (see mm/vmscan.c) + recent_rotated_file VM internal parameter. (see mm/vmscan.c) + recent_scanned_anon VM internal parameter. (see mm/vmscan.c) + recent_scanned_file VM internal parameter. (see mm/vmscan.c) + ========================= ======================================== .. hint:: recent_rotated means recent frequency of LRU rotation. 
From 5fa16afc4b5ab0f030fe9732bac431557112b0c6 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:32 +0700 Subject: [PATCH 08/11] docs: cgroup-v1: make swap extension subsections subsections Subsections text of swap extension section is marked up as bold text, whereas making them proper subsection is more appropriate. Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memory.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 162cc26dcddb2..0e583a6f78395 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -244,7 +244,8 @@ In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. By using the memsw limit, you can avoid system OOM which can be caused by swap shortage. -**why 'memory+swap' rather than swap** +2.4.1 why 'memory+swap' rather than swap +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The global LRU(kswapd) can swap out arbitrary pages. Swap-out means to move account from memory to swap...there is no change in usage of @@ -252,7 +253,8 @@ memory+swap. In other words, when we want to limit the usage of swap without affecting global LRU, memory+swap limit is better than just limiting swap from an OS point of view. -**What happens when a cgroup hits memory.memsw.limit_in_bytes** +2.4.2. What happens when a cgroup hits memory.memsw.limit_in_bytes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out in this cgroup. 
Then, swap-out will not be done by cgroup routine and file From da3ad2e14f6314a0246ae3c88afef59c8b49a3e5 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:33 +0700 Subject: [PATCH 09/11] docs: cgroup-v1: add internal cross-references The documentation contains references to other sections in the doc (internal). Add cross-references for them so that these can be accessed without having to manually search for them. Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- .../admin-guide/cgroup-v1/memory.rst | 43 +++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 0e583a6f78395..16d938abe69f8 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -55,7 +55,8 @@ Features: - Root cgroup has no limit controls. Kernel memory support is a work in progress, and the current version provides - basically functionality. (See Section 2.7) + basically functionality. (See :ref:`section 2.7 + <cgroup-v1-memory-kernel-extension>`) Brief summary of control files. @@ -221,8 +222,9 @@ behind this approach is that a cgroup that aggressively uses a shared page will eventually get charged for it (once it is uncharged from the cgroup that brought it in -- this will happen on memory pressure). -But see section 8.2: when moving a task to another cgroup, its pages may -be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. +But see :ref:`section 8.2 <cgroup-v1-memory-movable-charges>` when moving a +task to another cgroup, its pages may be recharged to the new cgroup, if +move_charge_at_immigrate has been chosen. 2.4 Swap Extension -------------------------------------- @@ -270,7 +272,7 @@ global VM. When a cgroup goes over its limit, we first try to reclaim memory from the cgroup so as to make space for the new pages that the cgroup has touched. 
If the reclaim is unsuccessful, an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) +cgroup. (See :ref:`10. OOM Control <cgroup-v1-memory-oom-control>` below.) The reclaim algorithm has not been modified for cgroups, except that pages that are selected for reclaiming come from the per-cgroup LRU @@ -284,7 +286,7 @@ list. When panic_on_oom is set to "2", the whole system will panic. When oom event notifier is registered, event will be delivered. -(See oom_control section) +(See :ref:`oom_control <cgroup-v1-memory-oom-control>` section) 2.6 Locking ----------- @@ -301,6 +303,8 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by lruvec->lru_lock; PG_lru bit of page->flags is cleared before isolating a page from its LRU under lruvec->lru_lock. +.. _cgroup-v1-memory-kernel-extension: + 2.7 Kernel Memory Extension ----------------------------------------------- @@ -460,6 +464,8 @@ test because it has noise of shared objects/status. But the above two are testing extreme situations. Trying usual test under memory controller is always helpful. +.. _cgroup-v1-memory-test-troubleshoot: + 4.1 Troubleshooting ------------------- @@ -472,8 +478,11 @@ terminated by the OOM killer. There are several causes for this: A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of some of the pages cached in the cgroup (page cache pages). -To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and -seeing what happens will be helpful. +To know what happens, disabling OOM_Kill as per :ref:`"10. OOM Control" +<cgroup-v1-memory-oom-control>` (below) and seeing what happens will be +helpful. + +.. _cgroup-v1-memory-test-task-migration: 4.2 Task migration ------------------ @@ -484,15 +493,16 @@ remain charged to it, the charge is dropped when the page is freed or reclaimed. You can move charges of a task along with task migration. -See 8. "Move charges at task migration" +See :ref:`8. 
"Move charges at task migration" <cgroup-v1-memory-move-charges>` 4.3 Removing a cgroup --------------------- -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. (because we charge against pages, not -against tasks.) +A cgroup can be removed by rmdir, but as discussed in :ref:`sections 4.1 +<cgroup-v1-memory-test-troubleshoot>` and :ref:`4.2 +<cgroup-v1-memory-test-task-migration>`, a cgroup might have some charge +associated with it, even though all tasks have migrated away from it. (because +we charge against pages, not against tasks.) We move the stats to parent, and no change on the charge except uncharging from the child. @@ -719,6 +729,8 @@ If we want to change this to 1G, we can at any time use:: It is recommended to set the soft limit always below the hard limit, otherwise the hard limit will take precedence. +.. _cgroup-v1-memory-move-charges: + 8. Move charges at task migration ================================= @@ -739,7 +751,8 @@ If you want to enable it:: .. note:: Each bits of move_charge_at_immigrate has its own meaning about what type - of charges should be moved. See 8.2 for details. + of charges should be moved. See :ref:`section 8.2 + <cgroup-v1-memory-movable-charges>` for details. .. note:: Charges are moved only when you move mm->owner, in other words, @@ -757,6 +770,8 @@ And if you want disable it again:: # echo 0 > memory.move_charge_at_immigrate +.. _cgroup-v1-memory-movable-charges: + 8.2 Type of charges which can be moved -------------------------------------- @@ -806,6 +821,8 @@ threshold in any direction. It's applicable for root and non-root cgroup. +.. _cgroup-v1-memory-oom-control: + 10. 
OOM Control =============== From 980660cae7994ab03b31b2a32940c70e8421fc99 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Thu, 5 Jan 2023 20:16:34 +0700 Subject: [PATCH 10/11] docs: cgroup-v1: use numbered lists for user interface setup Setup instructions for memory resource controller UI uses a mix of section headings and normal paragraphs, whereas numbered lists are better fit for this purpose. While at it, also slightly reword the instructions and add reference to "Why are cgroups needed?" in the main cgroups documentation. Signed-off-by: Bagas Sanjaya Signed-off-by: Tejun Heo --- .../admin-guide/cgroup-v1/cgroups.rst | 2 ++ .../admin-guide/cgroup-v1/memory.rst | 26 ++++++++----------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst index b0688011ed06d..9343148ee9936 100644 --- a/Documentation/admin-guide/cgroup-v1/cgroups.rst +++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst @@ -80,6 +80,8 @@ access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rs you to associate a set of CPUs and a set of memory nodes with the tasks in each cgroup. +.. _cgroups-why-needed: + 1.2 Why are cgroups needed ? ---------------------------- diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 16d938abe69f8..27d89495ac880 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -387,30 +387,30 @@ U != 0, K >= U: 3. User Interface ================= -3.0. Configuration ------------------- - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_MEMCG - -3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) -------------------------------------------------------------------- +To use the user interface: -:: +1. Enable CONFIG_CGROUPS and CONFIG_MEMCG options +2. Prepare the cgroups (see :ref:`Why are cgroups needed? 
+ <cgroups-why-needed>` for the background information):: # mount -t tmpfs none /sys/fs/cgroup # mkdir /sys/fs/cgroup/memory # mount -t cgroup none /sys/fs/cgroup/memory -o memory -3.2. Make the new group and move bash into it:: +3. Make the new group and move bash into it:: # mkdir /sys/fs/cgroup/memory/0 # echo $$ > /sys/fs/cgroup/memory/0/tasks -Since now we're in the 0 cgroup, we can alter the memory limit:: +4. Since now we're in the 0 cgroup, we can alter the memory limit:: # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes + The limit can now be queried:: + + # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes + 4194304 + .. note:: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, @@ -422,10 +422,6 @@ Since now we're in the 0 cgroup, we can alter the memory limit:: .. note:: We cannot set limits on the root cgroup any more. -:: - - # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes - 4194304 We can check the usage:: From 32a47817d07557ffca9992964c514fd79bda6fba Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 7 Jan 2023 18:12:17 -0800 Subject: [PATCH 11/11] cgroup/cpuset: fix a few kernel-doc warnings & coding style Fix kernel-doc notation warnings: kernel/cgroup/cpuset.c:1309: warning: Excess function parameter 'cpuset' description in 'update_parent_subparts_cpumask' kernel/cgroup/cpuset.c:3909: warning: expecting prototype for cpuset_mem_spread_node(). Prototype was for cpuset_spread_node() instead Also drop a blank line before EXPORT_SYMBOL_GPL() to be consistent with kernel coding style. 
Signed-off-by: Randy Dunlap Cc: Waiman Long Cc: Zefan Li Cc: Tejun Heo Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 87fe410361b3d..a585ced99e1ef 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1267,7 +1267,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); /** * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset - * @cpuset: The cpuset that requests change in partition root state + * @cs: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update * @tmp: Temporary addmask and delmask @@ -3877,8 +3877,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) } /** - * cpuset_mem_spread_node() - On which node to begin search for a file page - * cpuset_slab_spread_node() - On which node to begin search for a slab page + * cpuset_spread_node() - On which node to begin search for a page * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), @@ -3902,12 +3901,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) * is passed an offline node, it will fall back to the local node. * See kmem_cache_alloc_node(). 
*/ - static int cpuset_spread_node(int *rotor) { return *rotor = next_node_in(*rotor, current->mems_allowed); } +/** + * cpuset_mem_spread_node() - On which node to begin search for a file page + */ int cpuset_mem_spread_node(void) { if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) @@ -3917,6 +3918,9 @@ int cpuset_mem_spread_node(void) return cpuset_spread_node(&current->cpuset_mem_spread_rotor); } +/** + * cpuset_slab_spread_node() - On which node to begin search for a slab page + */ int cpuset_slab_spread_node(void) { if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) @@ -3925,7 +3929,6 @@ int cpuset_slab_spread_node(void) return cpuset_spread_node(&current->cpuset_slab_spread_rotor); } - EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); /**