From 7347586f007f7bc62bbbd1cba9d9351e4f79ebb0 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Fri, 31 Jan 2025 14:10:57 +0100 Subject: [PATCH 01/22] csky: Remove the size from alignment_tbl declaration Having to synchronize the number of ctl_table array elements with the size in the declaration can lead to discrepancies between the two values. Since commit d7a76ec87195 ("sysctl: Remove check for sentinel element in ctl_table arrays"), the calculation of the ctl_table array size is done solely by the ARRAY_SIZE macro removing the need for the size in the declaration. Remove the size for the aligment_tbl declaration and const qualify the array for good measure. Signed-off-by: Joel Granados Reviewed-by: Kees Cook --- arch/csky/abiv1/alignment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c index e5b8b4b2109ac..aee904833dece 100644 --- a/arch/csky/abiv1/alignment.c +++ b/arch/csky/abiv1/alignment.c @@ -300,7 +300,7 @@ void csky_alignment(struct pt_regs *regs) force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr); } -static struct ctl_table alignment_tbl[5] = { +static const struct ctl_table alignment_tbl[] = { { .procname = "kernel_enable", .data = &align_kern_enable, From b8974b89603c601f64b302d80129b0faf4879199 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:36 +0800 Subject: [PATCH 02/22] mm: vmstat: move sysctls to mm/vmstat.c This moves all vmstat related sysctls to its own file, removes useless extern variable declarations, and do some related clean-ups. To avoid compiler warnings when CONFIG_PROC_FS is not defined, add the macro definition CONFIG_PROC_FS ahead CONFIG_NUMA in vmstat.c. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/vmstat.h | 11 ----------- kernel/sysctl.c | 28 --------------------------- mm/vmstat.c | 44 ++++++++++++++++++++++++++++++++++++++---- 3 files changed, 40 insertions(+), 43 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9f3a04345b860..4751e3ecc467d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -10,15 +10,8 @@ #include #include -extern int sysctl_stat_interval; - #ifdef CONFIG_NUMA -#define ENABLE_NUMA_STAT 1 -#define DISABLE_NUMA_STAT 0 -extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { @@ -304,10 +297,6 @@ void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); -struct ctl_table; -int vmstat_refresh(const struct ctl_table *, int write, void *buffer, size_t *lenp, - loff_t *ppos); - void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb57da499ebb1..5e6ea570cfea9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -2071,17 +2070,6 @@ static const struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, -#ifdef CONFIG_NUMA - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif { .procname = "drop_caches", .data = &sysctl_drop_caches, @@ -2147,22 +2135,6 @@ static const struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "stat_refresh", - .data = NULL, - .maxlen = 0, - .mode = 0600, - .proc_handler = vmstat_refresh, - }, -#endif #ifdef CONFIG_MMU { .procname = "mmap_min_addr", diff --git a/mm/vmstat.c b/mm/vmstat.c index 16bfe1c694dd4..39704b5b17e02 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -31,8 +31,10 @@ #include "internal.h" +#ifdef CONFIG_PROC_FS #ifdef CONFIG_NUMA -int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; +#define ENABLE_NUMA_STAT 1 +static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; /* zero numa counters within a zone */ static void zero_zone_numa_counters(struct zone *zone) @@ -74,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); -int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, +static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -102,6 +104,7 @@ int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, return ret; } #endif +#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -1938,7 +1941,7 @@ static const struct seq_operations vmstat_op = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); -int sysctl_stat_interval __read_mostly = HZ; +static int sysctl_stat_interval __read_mostly = HZ; static int vmstat_late_init_done; #ifdef CONFIG_PROC_FS @@ -1947,7 +1950,7 @@ static void refresh_vm_stats(struct work_struct *work) refresh_cpu_vm_stats(true); } -int vmstat_refresh(const struct ctl_table *table, int write, +static int vmstat_refresh(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { long val; @@ -2196,6 +2199,38 @@ static int __init vmstat_late_init(void) late_initcall(vmstat_late_init); #endif +#ifdef CONFIG_PROC_FS +static const struct ctl_table vmstat_table[] = { +#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +}; +#endif + struct workqueue_struct *mm_percpu_wq; void __init init_mm_internals(void) @@ -2227,6 +2262,7 @@ void __init init_mm_internals(void) proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); proc_create_seq("vmstat", 0444, NULL, &vmstat_op); proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); + register_sysctl_init("vm", vmstat_table); #endif } From 73aa354af21d4c09b31e9b287f4cb32a9b4e9c8c Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:37 +0800 Subject: [PATCH 03/22] mm: filemap: move sysctl to mm/filemap.c This moves the filemap related sysctl to mm/filemap.c, and removes the redundant external variable declaration. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/mm.h | 2 -- kernel/sysctl.c | 8 -------- mm/filemap.c | 18 +++++++++++++++--- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb70..70cca450a68a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -40,8 +40,6 @@ struct user_struct; struct pt_regs; struct folio_batch; -extern int sysctl_page_lock_unfairness; - void mm_core_init(void); void init_mm_internals(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5e6ea570cfea9..74a33552731d7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2079,14 +2079,6 @@ static const struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, - { - .procname = "page_lock_unfairness", - .data = &sysctl_page_lock_unfairness, - .maxlen = sizeof(sysctl_page_lock_unfairness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, #ifdef CONFIG_MMU { .procname = "max_map_count", diff --git a/mm/filemap.c b/mm/filemap.c index 804d7365680c1..004d78767804d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1078,6 +1079,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio) return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } +/* How many times do we accept lock stealing from under a waiter? */ +static int sysctl_page_lock_unfairness = 5; +static const struct ctl_table filemap_sysctl_table[] = { + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +}; + void __init pagecache_init(void) { int i; @@ -1086,6 +1100,7 @@ void __init pagecache_init(void) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); + register_sysctl_init("vm", filemap_sysctl_table); } /* @@ -1233,9 +1248,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, return true; } -/* How many times do we accept lock stealing from under a waiter? */ -int sysctl_page_lock_unfairness = 5; - static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { From 7e05627ee17bf30422705f94627cf9da70683887 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:38 +0800 Subject: [PATCH 04/22] mm: swap: move sysctl to mm/swap.c The page-cluster belongs to mm/swap.c, move it to mm/swap.c . Removes the redundant external variable declaration and unneeded include(linux/swap.h). Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/mm.h | 2 -- kernel/sysctl.c | 10 ---------- mm/swap.c | 16 +++++++++++++++- mm/swap.h | 1 + 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 70cca450a68a2..23cfaa40fef38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -76,8 +76,6 @@ static inline void totalram_pages_add(long count) } extern void * high_memory; -extern int page_cluster; -extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74a33552731d7..b14e8764d45b3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -2044,15 +2043,6 @@ static const struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = overcommit_kbytes_handler, }, - { - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&page_cluster_max, - }, { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, diff --git a/mm/swap.c b/mm/swap.c index fc8281ef42415..b81cce146eb29 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -45,7 +45,7 @@ /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; -const int page_cluster_max = 31; +static const int page_cluster_max = 31; struct cpu_fbatches { /* @@ -1076,6 +1076,18 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } +static const struct ctl_table swap_sysctl_table[] = { + { + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&page_cluster_max, + } +}; + /* * Perform any setup for the swap system */ @@ -1092,4 +1104,6 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ + + register_sysctl_init("vm", swap_sysctl_table); } diff --git a/mm/swap.h b/mm/swap.h index ad2f121de970d..274dcc6219a08 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -3,6 +3,7 @@ #define _MM_SWAP_H struct mempolicy; +extern int page_cluster; #ifdef CONFIG_SWAP #include /* for swp_offset */ From 538d5baacd8a01b814777af1c9f1f9740a714707 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:39 +0800 Subject: [PATCH 05/22] mm: vmscan: move vmscan sysctls to mm/vmscan.c This moves vm_swappiness and zone_reclaim_mode to mm/vmscan.c, as part of the kernel/sysctl.c cleaning, also moves some external variable declarations and function declarations from include/linux/swap.h into mm/internal.h. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/swap.h | 9 --------- kernel/sysctl.c | 19 ------------------- mm/internal.h | 10 ++++++++++ mm/vmscan.c | 23 +++++++++++++++++++++++ 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index b13b72645db33..a98c757400fed 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -433,19 +433,10 @@ extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_NUMA -extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; -#else -#define node_reclaim_mode 0 #endif -static inline bool node_reclaim_enabled(void) -{ - /* Is any node_reclaim_mode bit set? */ - return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); -} - void check_move_unevictable_folios(struct folio_batch *fbatch); extern void __meminit kswapd_run(int nid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b14e8764d45b3..c1434adb8d506 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2051,15 +2051,6 @@ static const struct ctl_table vm_table[] = { .proc_handler = dirtytime_interval_handler, .extra1 = SYSCTL_ZERO, }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO_HUNDRED, - }, { .procname = "drop_caches", .data = &sysctl_drop_caches, @@ -2107,16 +2098,6 @@ static const struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &node_reclaim_mode, - .maxlen = sizeof(node_reclaim_mode), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif #ifdef CONFIG_MMU { .procname = "mmap_min_addr", diff --git a/mm/internal.h b/mm/internal.h index 109ef30fee11f..cdbbe4c215c02 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1097,9 +1097,13 @@ static inline void mminit_verify_zonelist(void) #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA +extern int node_reclaim_mode; + extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else +#define node_reclaim_mode 0 + static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { @@ -1111,6 +1115,12 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) } #endif +static inline bool node_reclaim_enabled(void) +{ + /* Is any node_reclaim_mode bit set? */ + return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); +} + /* * mm/memory-failure.c */ diff --git a/mm/vmscan.c b/mm/vmscan.c index c767d71c43d7d..eb228a8cd7695 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7404,6 +7404,28 @@ void __meminit kswapd_stop(int nid) pgdat_kswapd_unlock(pgdat); } +static const struct ctl_table vmscan_sysctl_table[] = { + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO_HUNDRED, + }, +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +#endif +}; + static int __init kswapd_init(void) { int nid; @@ -7411,6 +7433,7 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); + register_sysctl_init("vm", vmscan_sysctl_table); return 0; } From b1e8d7134eb61677c8a3207004d1ce34305d1f35 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:40 +0800 Subject: [PATCH 06/22] mm: util: move sysctls to mm/util.c This moves all util related sysctls to mm/util.c, as part of the kernel/sysctl.c cleaning, also removes redundant external variable declarations and function declarations. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/mm.h | 11 -------- include/linux/mman.h | 2 -- kernel/sysctl.c | 37 ------------------------ mm/util.c | 67 ++++++++++++++++++++++++++++++++++++++------ 4 files changed, 59 insertions(+), 58 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 23cfaa40fef38..7e0018222f640 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -205,17 +205,6 @@ extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; -extern unsigned long sysctl_overcommit_kbytes; - -int overcommit_ratio_handler(const struct ctl_table *, int, void *, size_t *, - loff_t *); -int overcommit_kbytes_handler(const struct ctl_table *, int, void *, size_t *, - loff_t *); -int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *, - loff_t *); - #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) diff --git a/include/linux/mman.h b/include/linux/mman.h index a842783ffa62b..bce214fece16b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -59,8 +59,6 @@ | MAP_HUGE_1GB) extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; -extern unsigned long sysctl_overcommit_kbytes; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c1434adb8d506..ad62b76851e78 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2020,29 +2020,6 @@ static const struct ctl_table kern_table[] = { }; static const struct ctl_table vm_table[] = { - { - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = overcommit_policy_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = overcommit_ratio_handler, - }, - { - .procname = "overcommit_kbytes", - .data = &sysctl_overcommit_kbytes, - .maxlen = sizeof(sysctl_overcommit_kbytes), - .mode = 0644, - .proc_handler = overcommit_kbytes_handler, - }, { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, @@ -2123,20 +2100,6 @@ static const struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, - .maxlen = sizeof(sysctl_user_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "admin_reserve_kbytes", - .data = &sysctl_admin_reserve_kbytes, - .maxlen = sizeof(sysctl_admin_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS { .procname = "mmap_rnd_bits", diff --git a/mm/util.c b/mm/util.c index b6b9684a14388..b3480f358d114 100644 --- a/mm/util.c +++ b/mm/util.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -906,14 +907,16 @@ int folio_mc_copy(struct folio *dst, struct folio *src) EXPORT_SYMBOL(folio_mc_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; -int sysctl_overcommit_ratio __read_mostly = 50; -unsigned long sysctl_overcommit_kbytes __read_mostly; +static int sysctl_overcommit_ratio __read_mostly = 50; +static unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SYSCTL + +static int overcommit_ratio_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -928,8 +931,8 @@ static void sync_overcommit_as(struct work_struct *dummy) percpu_counter_sync(&vm_committed_as); } -int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int overcommit_policy_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; @@ -964,8 +967,8 @@ int overcommit_policy_handler(const struct ctl_table *table, int write, void *bu return ret; } -int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int overcommit_kbytes_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -975,6 +978,54 @@ int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *bu return ret; } +static const struct ctl_table util_sysctl_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = overcommit_policy_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, + }, + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +}; + +static int __init init_vm_util_sysctls(void) +{ + register_sysctl_init("vm", util_sysctl_table); + return 0; +} +subsys_initcall(init_vm_util_sysctls); +#endif /* CONFIG_SYSCTL */ + /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ From aacdde7202140c8aa4a7a600ad410b11af3b7e4f Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:41 +0800 Subject: [PATCH 07/22] mm: mmap: move sysctl to mm/mmap.c This moves all mmap related sysctls to mm/mmap.c, as part of the kernel/sysctl.c cleaning, also move the variable declaration from kernel/sysctl.c into mm/mmap.c. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Lorenzo Stoakes Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- kernel/sysctl.c | 50 +-------------------------------------------- mm/mmap.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad62b76851e78..98a95396d20b7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -127,12 +127,6 @@ enum sysctl_writes_mode { static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #endif /* CONFIG_PROC_SYSCTL */ - -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -int sysctl_legacy_va_layout; -#endif - #endif /* CONFIG_SYSCTL */ /* @@ -2037,16 +2031,7 @@ static const struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, -#ifdef CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#else +#ifndef CONFIG_MMU { .procname = "nr_trim_pages", .data = &sysctl_nr_trim_pages, @@ -2064,17 +2049,6 @@ static const struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif #ifdef CONFIG_MMU { .procname = "mmap_min_addr", @@ -2100,28 +2074,6 @@ static const struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS - { - .procname = "mmap_rnd_bits", - .data = &mmap_rnd_bits, - .maxlen = sizeof(mmap_rnd_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_bits_min, - .extra2 = (void *)&mmap_rnd_bits_max, - }, -#endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS - { - .procname = "mmap_rnd_compat_bits", - .data = &mmap_rnd_compat_bits, - .maxlen = sizeof(mmap_rnd_compat_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_compat_bits_min, - .extra2 = (void *)&mmap_rnd_compat_bits_max, - }, -#endif }; int __init sysctl_init_bases(void) diff --git a/mm/mmap.c b/mm/mmap.c index cda01071c7b1f..d6bbe435bd990 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1543,6 +1543,57 @@ struct vm_area_struct *_install_special_mapping( &special_mapping_vmops); } +#ifdef CONFIG_SYSCTL +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) +int sysctl_legacy_va_layout; +#endif + +static const struct ctl_table mmap_table[] = { + { + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) + { + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif +}; +#endif /* CONFIG_SYSCTL */ + /* * initialise the percpu counter for VM */ @@ -1552,6 +1603,9 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", mmap_table); +#endif } /* From b121dd4d557212067275e988137f2e2c5b2c0077 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:42 +0800 Subject: [PATCH 08/22] security: min_addr: move sysctl to security/min_addr.c The dac_mmap_min_addr belongs to min_addr.c, move it to min_addr.c from /kernel/sysctl.c. In the previous Linux kernel boot process, sysctl_init_bases needs to be executed before init_mmap_min_addr, So, register_sysctl_init should be executed before update_mmap_min_addr in init_mmap_min_addr. And according to the compilation condition in security/Makefile: obj-$(CONFIG_MMU) += min_addr.o if CONFIG_MMU is not defined, min_addr.c would not be included in the compilation process. So, drop the CONFIG_MMU check. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Acked-by: Paul Moore Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- kernel/sysctl.c | 9 --------- security/min_addr.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98a95396d20b7..c9e91ea550d7c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2049,15 +2049,6 @@ static const struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { diff --git a/security/min_addr.c b/security/min_addr.c index 0ce267c041ab2..df1bc643d886b 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -44,8 +44,19 @@ int mmap_min_addr_handler(const struct ctl_table *table, int write, return ret; } +static const struct ctl_table min_addr_sysctl_table[] = { + { + .procname = "mmap_min_addr", + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = mmap_min_addr_handler, + }, +}; + static int __init init_mmap_min_addr(void) { + register_sysctl_init("vm", min_addr_sysctl_table); update_mmap_min_addr(); return 0; From 97f5420ef1f4cc5e48ac90f5496039b31cf74036 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:43 +0800 Subject: [PATCH 09/22] mm: nommu: move sysctl to mm/nommu.c The sysctl_nr_trim_pages belongs to nommu.c, move it to mm/nommu.c from /kernel/sysctl.c. And remove the useless extern variable declaration from include/linux/mm.h Signed-off-by: Kaixiong Yu Reviewed-by: Lorenzo Stoakes Signed-off-by: Joel Granados --- include/linux/mm.h | 2 -- kernel/sysctl.c | 10 ---------- mm/nommu.c | 15 ++++++++++++++- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7e0018222f640..1c470e505bd47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4066,8 +4066,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif -extern int sysctl_nr_trim_pages; - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9e91ea550d7c..e923e4fd1989f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2031,16 +2031,6 @@ static const struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, -#ifndef CONFIG_MMU - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, diff --git a/mm/nommu.c b/mm/nommu.c index baa79abdaf037..3c32f8b1eb541 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -48,7 +48,6 @@ struct page *mem_map; unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; -int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -392,6 +391,19 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) return mm->brk = brk; } +static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; + +static const struct ctl_table nommu_table[] = { + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +}; + /* * initialise the percpu counter for VM and region record slabs */ @@ -402,6 +414,7 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); + register_sysctl_init("vm", nommu_table); } /* From 94eed61d5877c39b9dcdb9079aa6e0907aeb5f7d Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:44 +0800 Subject: [PATCH 10/22] fs: fs-writeback: move sysctl to fs/fs-writeback.c The dirtytime_expire_interval belongs to fs/fs-writeback.c, move it to fs/fs-writeback.c from /kernel/sysctl.c. And remove the useless extern variable declaration and the function declaration from include/linux/writeback.h Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- fs/fs-writeback.c | 30 +++++++++++++++++++++--------- include/linux/writeback.h | 4 ---- kernel/sysctl.c | 8 -------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3cd99e2dc6ac6..cc57367fb641d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -65,7 +65,7 @@ struct wb_writeback_work { * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. */ -unsigned int dirtytime_expire_interval = 12 * 60 * 60; +static unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { @@ -2435,14 +2435,7 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } -static int __init start_dirtytime_writeback(void) -{ - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); - return 0; -} -__initcall(start_dirtytime_writeback); - -int dirtytime_interval_handler(const struct ctl_table *table, int write, +static int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2453,6 +2446,25 @@ int dirtytime_interval_handler(const struct ctl_table *table, int write, return ret; } +static const struct ctl_table vm_fs_writeback_table[] = { + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirtytime_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = SYSCTL_ZERO, + }, +}; + +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + register_sysctl_init("vm", vm_fs_writeback_table); + return 0; +} +__initcall(start_dirtytime_writeback); + /** * __mark_inode_dirty - internal function to mark an inode dirty * diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d11b903c2edb8..caf4f0b12235e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -327,12 +327,8 @@ extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; -extern unsigned int dirtytime_expire_interval; extern int laptop_mode; -int dirtytime_interval_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); - void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); unsigned long cgwb_calc_thresh(struct bdi_writeback *wb); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e923e4fd1989f..40d68fb547791 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2014,14 +2014,6 @@ static const struct ctl_table kern_table[] = { }; static const struct ctl_table vm_table[] = { - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, { .procname = "drop_caches", .data = &sysctl_drop_caches, From f5d64ae331d00c27cc6fc615811a9ba13de5fb0e Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:45 +0800 Subject: [PATCH 11/22] fs: drop_caches: move sysctl to fs/drop_caches.c The sysctl_drop_caches to fs/drop_caches.c, move it to fs/drop_caches.c from /kernel/sysctl.c. And remove the useless extern variable declaration from include/linux/mm.h Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- fs/drop_caches.c | 23 +++++++++++++++++++++-- include/linux/mm.h | 6 ------ kernel/sysctl.c | 9 --------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d45ef541d848a..019a8b4eaaf99 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -14,7 +14,7 @@ #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ -int sysctl_drop_caches; +static int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { @@ -48,7 +48,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -int drop_caches_sysctl_handler(const struct ctl_table *table, int write, +static int drop_caches_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret; @@ -77,3 +77,22 @@ int drop_caches_sysctl_handler(const struct ctl_table *table, int write, } return 0; } + +static const struct ctl_table drop_caches_table[] = { + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, + }, +}; + +static int __init init_vm_drop_caches_sysctls(void) +{ + register_sysctl_init("vm", drop_caches_table); + return 0; +} +fs_initcall(init_vm_drop_caches_sysctls); diff --git a/include/linux/mm.h b/include/linux/mm.h index 1c470e505bd47..f5ba3ed8b44a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3789,12 +3789,6 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); -#ifdef CONFIG_SYSCTL -extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(const struct ctl_table *, int, void *, size_t *, - loff_t *); -#endif - void drop_slab(void); #ifndef CONFIG_MMU diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40d68fb547791..b7142ed5c48dd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2014,15 +2014,6 @@ static const struct ctl_table kern_table[] = { }; static const struct ctl_table vm_table[] = { - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_FOUR, - }, { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, From c8c3fd194678e99460f22fffa170f7628e3e171c Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:46 +0800 Subject: [PATCH 12/22] sunrpc: simplify rpcauth_cache_shrink_count() It is inappropriate to use sysctl_vfs_cache_pressure here. The sysctl is documented as: This percentage value controls the tendency of the kernel to reclaim the memory which is used for caching of directory and inode objects. So, simplify result of rpcauth_cache_shrink_count() to "return number_cred_unused;". Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Acked-by: Anna Schumaker Acked-by: Jeff Layton Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- net/sunrpc/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 04534ea537c8f..5a827afd8e3b2 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -489,7 +489,7 @@ static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return number_cred_unused * sysctl_vfs_cache_pressure / 100; + return number_cred_unused; } static void From 52e66823e0bea9adbe3e16075b5d76867ca695aa Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:47 +0800 Subject: [PATCH 13/22] fs: dcache: move the sysctl to fs/dcache.c The sysctl_vfs_cache_pressure belongs to fs/dcache.c, move it to fs/dcache.c from kernel/sysctl.c. As a part of fs/dcache.c cleaning, sysctl_vfs_cache_pressure is changed to a static variable, and change the inline-type function vfs_pressure_ratio() to out-of-inline type, export vfs_pressure_ratio() with EXPORT_SYMBOL_GPL to be used by other files. Move the unneeded include(linux/dcache.h). Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- fs/dcache.c | 21 +++++++++++++++++++-- include/linux/dcache.h | 7 +------ kernel/sysctl.c | 9 --------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 9cc0d47da321c..329bbb1e13359 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -73,8 +73,13 @@ * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ -int sysctl_vfs_cache_pressure __read_mostly = 100; -EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); +static int sysctl_vfs_cache_pressure __read_mostly = 100; + +unsigned long vfs_pressure_ratio(unsigned long val) +{ + return mult_frac(val, sysctl_vfs_cache_pressure, 100); +} +EXPORT_SYMBOL_GPL(vfs_pressure_ratio); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); @@ -211,8 +216,20 @@ static const struct ctl_table fs_dcache_sysctls[] = { }, }; +static const struct ctl_table vm_dcache_sysctls[] = { + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +}; + static int __init init_fs_dcache_sysctls(void) { + register_sysctl_init("vm", vm_dcache_sysctls); register_sysctl_init("fs", fs_dcache_sysctls); return 0; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4afb603656756..f861e709b3850 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -520,12 +520,7 @@ static inline int simple_positive(const struct dentry *dentry) return d_really_is_positive(dentry) && !d_unhashed(dentry); } -extern int sysctl_vfs_cache_pressure; - -static inline unsigned long vfs_pressure_ratio(unsigned long val) -{ - return mult_frac(val, sysctl_vfs_cache_pressure, 100); -} +unsigned long vfs_pressure_ratio(unsigned long val); /** * d_inode - Get the actual inode of this dentry diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b7142ed5c48dd..7e345c7148a17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -2014,14 +2013,6 @@ static const struct ctl_table kern_table[] = { }; static const struct ctl_table vm_table[] = { - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { From a33e288147d549347484f95134cee012d757cc6f Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:48 +0800 Subject: [PATCH 14/22] x86: vdso: move the sysctl to arch/x86/entry/vdso/vdso32-setup.c When CONFIG_X86_32 is defined and CONFIG_UML is not defined, vdso_enabled belongs to arch/x86/entry/vdso/vdso32-setup.c. So, move it into its own file. Before this patch, vdso_enabled was allowed to be set to a value exceeding 1 on x86_32 architecture. After this patch is applied, vdso_enabled is not permitted to set the value more than 1. It does not matter, because according to the function load_vdso32(), only vdso_enabled is set to 1, VDSO would be enabled. Other values all mean "disabled". The same limitation could be seen in the function vdso32_setup(). Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- arch/x86/entry/vdso/vdso32-setup.c | 16 +++++++++++----- kernel/sysctl.c | 8 +------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index f6d2d8aba6434..8894013eea1da 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -51,15 +51,17 @@ __setup("vdso32=", vdso32_setup); __setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif -#ifdef CONFIG_X86_64 #ifdef CONFIG_SYSCTL -/* Register vsyscall32 into the ABI table */ #include -static const struct ctl_table abi_table2[] = { +static const struct ctl_table vdso_table[] = { { +#ifdef CONFIG_X86_64 .procname = "vsyscall32", +#else + .procname = "vdso_enabled", +#endif .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, @@ -71,10 +73,14 @@ static const struct ctl_table abi_table2[] = { static __init int ia32_binfmt_init(void) { - register_sysctl("abi", abi_table2); +#ifdef CONFIG_X86_64 + /* Register vsyscall32 into the ABI table */ + register_sysctl("abi", vdso_table); +#else + register_sysctl_init("vm", vdso_table); +#endif return 0; } __initcall(ia32_binfmt_init); #endif /* CONFIG_SYSCTL */ -#endif /* CONFIG_X86_64 */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7e345c7148a17..c4833a555bd2c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2013,17 +2013,11 @@ static const struct ctl_table kern_table[] = { }; static const struct ctl_table vm_table[] = { -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) +#if defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL) { .procname = "vdso_enabled", -#ifdef CONFIG_X86_32 - .data = &vdso32_enabled, - .maxlen = sizeof(vdso32_enabled), -#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), -#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = SYSCTL_ZERO, From f569ca4b145d07fd7c3ac1349b7da40f43c38e7b Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:49 +0800 Subject: [PATCH 15/22] sh: vdso: move the sysctl to arch/sh/kernel/vsyscall/vsyscall.c When CONFIG_SUPERH and CONFIG_VSYSCALL are defined, vdso_enabled belongs to arch/sh/kernel/vsyscall/vsyscall.c. So, move it into its own file. To avoid failure when registering the vdso_table, move the call to register_sysctl_init() into its own fs_initcall(). Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- arch/sh/kernel/vsyscall/vsyscall.c | 20 ++++++++++++++++++++ kernel/sysctl.c | 13 +------------ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index add35c51e0178..d80dd92a483af 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -14,6 +14,7 @@ #include #include #include +#include #include /* @@ -30,6 +31,17 @@ static int __init vdso_setup(char *s) } __setup("vdso=", vdso_setup); +static const struct ctl_table vdso_table[] = { + { + .procname = "vdso_enabled", + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + }, +}; + /* * These symbols are defined by vsyscall.o to mark the bounds * of the ELF DSO images included therein. @@ -58,6 +70,14 @@ int __init vsyscall_init(void) return 0; } +static int __init vm_sysctl_init(void) +{ + register_sysctl_init("vm", vdso_table); + return 0; +} + +fs_initcall(vm_sysctl_init); + /* Setup a VMA at program startup for the vsyscall page */ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c4833a555bd2c..ebe7c19abffb0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2012,18 +2012,7 @@ static const struct ctl_table kern_table[] = { #endif }; -static const struct ctl_table vm_table[] = { -#if defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL) - { - .procname = "vdso_enabled", - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif -}; +static const struct ctl_table vm_table[] = {}; int __init sysctl_init_bases(void) { From fa89dbda458b915dcb2f099c1fee495bde28943e Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:50 +0800 Subject: [PATCH 16/22] sysctl: remove the vm_table After patch1~14 is applied, all sysctls of vm_table would be moved. So, delete vm_table. Signed-off-by: Kaixiong Yu Signed-off-by: Joel Granados --- kernel/sysctl.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ebe7c19abffb0..4f2d93f701e53 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2012,12 +2012,9 @@ static const struct ctl_table kern_table[] = { #endif }; -static const struct ctl_table vm_table[] = {}; - int __init sysctl_init_bases(void) { register_sysctl_init("kernel", kern_table); - register_sysctl_init("vm", vm_table); return 0; } From dccf3c99febf09890d7fa99b8529f021d46787f9 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:51 +0800 Subject: [PATCH 17/22] sysctl: remove unneeded include Removing unneeded mm includes in kernel/sysctl.c. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Kalesh AP Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- kernel/sysctl.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4f2d93f701e53..eeb4cba121903 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,8 +20,6 @@ */ #include -#include -#include #include #include #include @@ -30,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -41,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -52,13 +48,11 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include "../lib/kstrtox.h" From 049439e22825507a90d4dedf3934e24fd0a8ff62 Mon Sep 17 00:00:00 2001 From: Nicolas Bouchinet Date: Wed, 15 Jan 2025 14:22:08 +0100 Subject: [PATCH 18/22] coredump: Fixes core_pipe_limit sysctl proc_handler proc_dointvec converts a string to a vector of signed int, which is stored in the unsigned int .data core_pipe_limit. It was thus authorized to write a negative value to core_pipe_limit sysctl which once stored in core_pipe_limit, leads to the signed int dump_count check against core_pipe_limit never be true. The same can be achieved with core_pipe_limit set to INT_MAX. Any negative write or >= to INT_MAX in core_pipe_limit sysctl would hypothetically allow a user to create very high load on the system by running processes that produces a coredump in case the core_pattern sysctl is configured to pipe core files to user space helper. Memory or PID exhaustion should happen before but it anyway breaks the core_pipe_limit semantic. This commit fixes this by changing core_pipe_limit sysctl's proc_handler to proc_dointvec_minmax and bound checking between SYSCTL_ZERO and SYSCTL_INT_MAX. Fixes: a293980c2e26 ("exec: let do_coredump() limit the number of concurrent dumps to pipes") Signed-off-by: Nicolas Bouchinet Reviewed-by: Jan Kara Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- fs/coredump.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/coredump.c b/fs/coredump.c index 591700e1b2ce6..dd0957149ec6b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1015,7 +1015,9 @@ static const struct ctl_table coredump_sysctls[] = { .data = &core_pipe_limit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "core_file_note_size_limit", From a231f5bdfdd0ac402656d82ee7e6c54ec86c0c3b Mon Sep 17 00:00:00 2001 From: Nicolas Bouchinet Date: Wed, 15 Jan 2025 14:22:09 +0100 Subject: [PATCH 19/22] sysctl: Fix underflow value setting risk in vm_table Commit 3b3376f222e3 ("sysctl.c: fix underflow value setting risk in vm_table") fixes underflow value setting risk in vm_table but misses vdso_enabled sysctl. vdso_enabled sysctl is initialized with .extra1 value as SYSCTL_ZERO to avoid negative value writes but the proc_handler is proc_dointvec and not proc_dointvec_minmax and thus do not uses .extra1 and .extra2. The following command thus works : `# echo -1 > /proc/sys/vm/vdso_enabled` This patch properly sets the proc_handler to proc_dointvec_minmax. In addition to .extra1, .extra2 is set to SYSCTL_ONE. The sysctl is thus bounded between 0 and 1. Fixes: 3b3376f222e3 ("sysctl.c: fix underflow value setting risk in vm_table") Signed-off-by: Nicolas Bouchinet Reviewed-by: Jan Kara Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- arch/sh/kernel/vsyscall/vsyscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index d80dd92a483af..1563dcc55fd32 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -37,8 +37,9 @@ static const struct ctl_table vdso_table[] = { .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, }; From 2694b6bb871d4a93ed29ca47991a9b823ea744f6 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 30 Dec 2024 11:15:00 +0100 Subject: [PATCH 20/22] MAINTAINERS: Update sysctl file list in MAINTAINERS Add kunit (lib/test_sysctl.c) and doc check script (scripts/check-sysctl-docs) to the list of files in MAINTAINERS. Signed-off-by: Joel Granados --- MAINTAINERS | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 896a307fa0654..1f4f40aa18cc1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18930,9 +18930,10 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl.git sysctl-next F: fs/proc/proc_sysctl.c F: include/linux/sysctl.h -F: kernel/sysctl-test.c -F: kernel/sysctl.c -F: tools/testing/selftests/sysctl/ +F: kernel/sysctl* +F: tools/testing/selftests/sysctl/* +F: lib/test_sysctl.c +F: scripts/check-sysctl-docs PS3 NETWORK SUPPORT M: Geoff Levand From ae9ebda1bc32b865b895d5331a00f24f8fd324e6 Mon Sep 17 00:00:00 2001 From: Chandra Pratap Date: Tue, 28 Jan 2025 16:06:55 +0530 Subject: [PATCH 21/22] selftests: fix spelling/grammar errors in sysctl/sysctl.sh Fix the grammatical/spelling errors in sysctl/sysctl.sh. This fixes all errors pointed out by codespell in the file. Signed-off-by: Chandra Pratap Signed-off-by: Joel Granados --- tools/testing/selftests/sysctl/sysctl.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 84472b436c07f..f6e129a82ffd8 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -21,7 +21,7 @@ TEST_FILE=$(mktemp) # ENABLED: 1 if enabled, 0 otherwise # TARGET: test target file required on the test_sysctl module # SKIP_NO_TARGET: 1 skip if TARGET not there -# 0 run eventhough TARGET not there +# 0 run even though TARGET not there # # Once these are enabled please leave them as-is. Write your own test, # we have tons of space. @@ -764,7 +764,7 @@ sysctl_test_0007() fi if [ ! -f /proc/cmdline ]; then - echo -e "SKIPPING\nThere is no /proc/cmdline to check for paramter" + echo -e "SKIPPING\nThere is no /proc/cmdline to check for parameter" return $ksft_skip fi @@ -898,7 +898,7 @@ usage() echo Example uses: echo echo "$TEST_NAME.sh -- executes all tests" - echo "$TEST_NAME.sh -t 0002 -- Executes test ID 0002 number of times is recomended" + echo "$TEST_NAME.sh -t 0002 -- Executes test ID 0002 number of times is recommended" echo "$TEST_NAME.sh -w 0002 -- Watch test ID 0002 run until an error occurs" echo "$TEST_NAME.sh -s 0002 -- Run test ID 0002 once" echo "$TEST_NAME.sh -c 0002 3 -- Run test ID 0002 three times" From 29fa7d7934216e0a93102a930ef28e2a6ae852b1 Mon Sep 17 00:00:00 2001 From: Bharadwaj Raju Date: Fri, 21 Feb 2025 15:51:49 +0530 Subject: [PATCH 22/22] selftests/sysctl: fix wording of help messages Change wording of test number recommendation to "the recommended number of times". Signed-off-by: Bharadwaj Raju Signed-off-by: Joel Granados --- tools/testing/selftests/sysctl/sysctl.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index f6e129a82ffd8..db1616857d898 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -857,7 +857,7 @@ list_tests() echo echo "TEST_ID x NUM_TEST" echo "TEST_ID: Test ID" - echo "NUM_TESTS: Number of recommended times to run the test" + echo "NUM_TESTS: Recommended number of times to run the test" echo echo "0001 x $(get_test_count 0001) - tests proc_dointvec_minmax()" echo "0002 x $(get_test_count 0002) - tests proc_dostring()" @@ -884,7 +884,7 @@ usage() echo "Valid tests: 0001-$MAX_TEST" echo "" echo " all Runs all tests (default)" - echo " -t Run test ID the number amount of times is recommended" + echo " -t Run test ID the recommended number of times" echo " -w Watch test ID run until it runs into an error" echo " -c Run test ID once" echo " -s Run test ID x test-count number of times" @@ -898,7 +898,7 @@ usage() echo Example uses: echo echo "$TEST_NAME.sh -- executes all tests" - echo "$TEST_NAME.sh -t 0002 -- Executes test ID 0002 number of times is recommended" + echo "$TEST_NAME.sh -t 0002 -- Executes test ID 0002 the recommended number of times" echo "$TEST_NAME.sh -w 0002 -- Watch test ID 0002 run until an error occurs" echo "$TEST_NAME.sh -s 0002 -- Run test ID 0002 once" echo "$TEST_NAME.sh -c 0002 3 -- Run test ID 0002 three times"