Merge branch 'mm-hotfixes-stable' into mm-stable.
Pick these into mm-stable:

5de1950 mm: resolve faulty mmap_region() error path behaviour
5baf8b0 mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling
0fb4a7a mm: refactor map_deny_write_exec()
4080ef1 mm: unconditionally close VMAs on error
3dd6ed3 mm: avoid unsafe VMA hook invocation when error arises on mmap hook
f8f931b mm/thp: fix deferred split unqueue naming and locking
e66f318 mm/thp: fix deferred split queue not partially_mapped

to get a clean merge of these from mm-unstable into mm-stable:

Subject: memcg-v1: fully deprecate move_charge_at_immigrate
Subject: memcg-v1: remove charge move code
Subject: memcg-v1: no need for memcg locking for dirty tracking
Subject: memcg-v1: no need for memcg locking for writeback tracking
Subject: memcg-v1: no need for memcg locking for MGLRU
Subject: memcg-v1: remove memcg move locking code
Subject: tools: testing: add additional vma_internal.h stubs
Subject: mm: isolate mmap internal logic to mm/vma.c
Subject: mm: refactor __mmap_region()
Subject: mm: remove unnecessary reset state logic on merge new VMA
Subject: mm: defer second attempt at merge on mmap()
Subject: mm/vma: the pgoff is correct if can_merge_right
Subject: memcg: workingset: remove folio_memcg_rcu usage
Andrew Morton committed Nov 6, 2024
2 parents 59b723c + 5de1950 commit 48901e9
Showing 17 changed files with 245 additions and 125 deletions.
10 changes: 7 additions & 3 deletions arch/arm64/include/asm/mman.h
@@ -6,6 +6,8 @@

#ifndef BUILD_VDSO
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>
#include <linux/types.h>

static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
@@ -31,19 +33,21 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
}
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)

static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
unsigned long flags)
{
/*
* Only allow MTE on anonymous mappings as these are guaranteed to be
* backed by tags-capable memory. The vm_flags may be overridden by a
* filesystem supporting MTE (RAM-based).
*/
if (system_supports_mte() && (flags & MAP_ANONYMOUS))
if (system_supports_mte() &&
((flags & MAP_ANONYMOUS) || shmem_file(file)))
return VM_MTE_ALLOWED;

return 0;
}
#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)

static inline bool arch_validate_prot(unsigned long prot,
unsigned long addr __always_unused)
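
Not part of the diff: the rule the updated arm64 hook encodes, namely that MTE is permitted for anonymous mappings and for shmem-backed files (memfd/tmpfs) because both are ordinary tag-capable RAM, can be observed from userspace with a small sketch like the one below. It is only an illustration: it assumes an arm64 kernel and CPU with MTE and a glibc that provides memfd_create(); anywhere else the PROT_MTE mappings are simply reported as rejected. The 0x20 fallback is the arm64 uapi value of PROT_MTE.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef PROT_MTE
#define PROT_MTE 0x20   /* arm64 <asm/mman.h> value; placeholder elsewhere */
#endif

static void try_map(const char *what, int fd, int flags)
{
    void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_MTE,
                   flags, fd, 0);

    if (p == MAP_FAILED)
        printf("%-15s PROT_MTE rejected: %s\n", what, strerror(errno));
    else
        printf("%-15s PROT_MTE accepted\n", what);
}

int main(void)
{
    int fd = memfd_create("mte-demo", 0);

    if (fd < 0 || ftruncate(fd, 4096) < 0) {
        perror("memfd_create/ftruncate");
        return 1;
    }
    try_map("anonymous:", -1, MAP_PRIVATE | MAP_ANONYMOUS);
    try_map("memfd (shmem):", fd, MAP_SHARED);
    close(fd);
    return 0;
}
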
5 changes: 3 additions & 2 deletions arch/parisc/include/asm/mman.h
@@ -2,6 +2,7 @@
#ifndef __ASM_MMAN_H__
#define __ASM_MMAN_H__

#include <linux/fs.h>
#include <uapi/asm/mman.h>

/* PARISC cannot allow mdwe as it needs writable stacks */
@@ -11,7 +12,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
}
#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported

static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags)
{
/*
* The stack on parisc grows upwards, so if userspace requests memory
@@ -23,6 +24,6 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)

return 0;
}
#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)

#endif /* __ASM_MMAN_H__ */
28 changes: 22 additions & 6 deletions include/linux/mman.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu_counter.h>

@@ -94,7 +95,7 @@ static inline void vm_unacct_memory(long pages)
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(flags) 0
#define arch_calc_vm_flag_bits(file, flags) 0
#endif

#ifndef arch_validate_prot
@@ -151,13 +152,13 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
* Combine the mmap "flags" argument into "vm_flags" used internally.
*/
static inline unsigned long
calc_vm_flag_bits(unsigned long flags)
calc_vm_flag_bits(struct file *file, unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
_calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) |
arch_calc_vm_flag_bits(flags);
arch_calc_vm_flag_bits(file, flags);
}

unsigned long vm_commit_limit(void);
@@ -188,16 +189,31 @@ static inline bool arch_memory_deny_write_exec_supported(void)
*
* d) mmap(PROT_READ | PROT_EXEC)
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
*
* This is only applicable if the user has set the Memory-Deny-Write-Execute
* (MDWE) protection mask for the current process.
*
* @old specifies the VMA flags the VMA originally possessed, and @new the ones
* we propose to set.
*
* Return: false if proposed change is OK, true if not ok and should be denied.
*/
static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
/* If MDWE is disabled, we have nothing to deny. */
if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
return false;

if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
/* If the new VMA is not executable, we have nothing to deny. */
if (!(new & VM_EXEC))
return false;

/* Under MDWE we do not accept newly writably executable VMAs... */
if (new & VM_WRITE)
return true;

if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
/* ...nor previously non-executable VMAs becoming executable. */
if (!(old & VM_EXEC))
return true;

return false;
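
Not part of the diff: the decision the refactored map_deny_write_exec() makes can be captured in a few lines of standalone C. This is only a sketch, with the MMF_HAS_MDWE process check omitted and the VM_WRITE/VM_EXEC bits defined locally, but it shows why case (d) above remains permitted while writable-executable mappings and exec-gaining changes are refused.

#include <assert.h>
#include <stdbool.h>

#define VM_WRITE 0x00000002UL   /* local copies of the vm_flags bits, for illustration */
#define VM_EXEC  0x00000004UL

/* old: flags the VMA already had; new: flags we propose to set */
static bool deny_write_exec(unsigned long old, unsigned long new)
{
    if (!(new & VM_EXEC))       /* not executable: nothing to deny */
        return false;
    if (new & VM_WRITE)         /* would be writable and executable */
        return true;
    return !(old & VM_EXEC);    /* would gain exec it never had */
}

int main(void)
{
    /* case (d): an already-executable mapping stays executable - allowed */
    assert(!deny_write_exec(VM_EXEC, VM_EXEC));
    /* writable + executable in one step - denied */
    assert(deny_write_exec(VM_WRITE | VM_EXEC, VM_WRITE | VM_EXEC));
    /* previously non-executable memory becoming executable - denied */
    assert(deny_write_exec(VM_WRITE, VM_EXEC));
    return 0;
}

In this sketch, passing old == new models a brand-new mapping, while old != new models a permission change on an existing VMA; the refactor lets one predicate cover both situations with plain flag words instead of a VMA pointer.
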
56 changes: 43 additions & 13 deletions mm/huge_memory.c
@@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
return split_huge_page_to_list_to_order(&folio->page, list, ret);
}

void __folio_undo_large_rmappable(struct folio *folio)
/*
* __folio_unqueue_deferred_split() is not to be called directly:
* the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
* limits its calls to those folios which may have a _deferred_list for
* queueing THP splits, and that list is (racily observed to be) non-empty.
*
* It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
* zero: because even when split_queue_lock is held, a non-empty _deferred_list
* might be in use on deferred_split_scan()'s unlocked on-stack list.
*
* If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
* therefore important to unqueue deferred split before changing folio memcg.
*/
bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
bool unqueued = false;

WARN_ON_ONCE(folio_ref_count(folio));
WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));

ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

return unqueued; /* useful for debug warnings */
}

/* partially_mapped=false won't clear PG_partially_mapped folio flag */
@@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
return;

/*
* The try_to_unmap() in page reclaim path might reach here too,
* this may cause a race condition to corrupt deferred split queue.
* And, if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in shrinker.
*
* Check the swapcache flag to determine if the folio is being
* handled by page reclaim since THP swap would add the folio into
* swap cache before calling try_to_unmap().
* Exclude swapcache: originally to avoid a corrupt deferred split
* queue. Nowadays that is fully prevented by mem_cgroup_swapout();
* but if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in the shrinker, so excluding
* swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
@@ -3718,8 +3735,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
LIST_HEAD(list);
struct folio *folio, *next;
int split = 0;
struct folio *folio, *next, *prev = NULL;
int split = 0, removed = 0;

#ifdef CONFIG_MEMCG
if (sc->memcg)
@@ -3775,15 +3792,28 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
*/
if (!did_split && !folio_test_partially_mapped(folio)) {
list_del_init(&folio->_deferred_list);
ds_queue->split_queue_len--;
removed++;
} else {
/*
* That unlocked list_del_init() above would be unsafe,
* unless its folio is separated from any earlier folios
* left on the list (which may be concurrently unqueued)
* by one safe folio with refcount still raised.
*/
swap(folio, prev);
}
folio_put(folio);
if (folio)
folio_put(folio);
}

spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
list_splice_tail(&list, &ds_queue->split_queue);
ds_queue->split_queue_len -= removed;
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

if (prev)
folio_put(prev);

/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.
55 changes: 50 additions & 5 deletions mm/internal.h
@@ -108,6 +108,51 @@ static inline void *folio_raw_mapping(const struct folio *folio)
return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
}

/*
* This is a file-backed mapping, and is about to be memory mapped - invoke its
* mmap hook and safely handle error conditions. On error, VMA hooks will be
* mutated.
*
* @file: File which backs the mapping.
* @vma: VMA which we are mapping.
*
* Returns: 0 if success, error otherwise.
*/
static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
{
int err = call_mmap(file, vma);

if (likely(!err))
return 0;

/*
* OK, we tried to call the file hook for mmap(), but an error
arose. The mapping is in an inconsistent state and we must not invoke
* any further hooks on it.
*/
vma->vm_ops = &vma_dummy_vm_ops;

return err;
}

/*
* If the VMA has a close hook then close it, and since closing it might leave
* it in an inconsistent state which makes the use of any hooks suspect, clear
* them down by installing dummy empty hooks.
*/
static inline void vma_close(struct vm_area_struct *vma)
{
if (vma->vm_ops && vma->vm_ops->close) {
vma->vm_ops->close(vma);

/*
* The mapping is in an inconsistent state, and no further hooks
* may be invoked upon it.
*/
vma->vm_ops = &vma_dummy_vm_ops;
}
}

#ifdef CONFIG_MMU

/* Flags for folio_pte_batch(). */
@@ -639,21 +684,21 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
#endif
}

void __folio_undo_large_rmappable(struct folio *folio);
static inline void folio_undo_large_rmappable(struct folio *folio)
bool __folio_unqueue_deferred_split(struct folio *folio);
static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
return;
return false;

/*
* At this point, there is no one trying to add the folio to
* deferred_list. If folio is not in deferred_list, it's safe
* to check without acquiring the split_queue_lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
return;
return false;

__folio_undo_large_rmappable(folio);
return __folio_unqueue_deferred_split(folio);
}

static inline struct folio *page_rmappable_folio(struct page *page)
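
Not part of the diff: mmap_file() and vma_close() above share one defensive pattern. Once a hook has failed, or the object has been closed, the ops pointer is swapped for a table of empty "dummy" operations (vma_dummy_vm_ops in the kernel) so that nothing can invoke further hooks on an object left in an inconsistent state. A stripped-down userspace C sketch of that pattern, with purely illustrative names:

#include <stdio.h>

struct obj;

struct obj_ops {
    int  (*open)(struct obj *o);
    void (*close)(struct obj *o);
};

struct obj {
    const struct obj_ops *ops;
};

/* Hook-less table installed once the real hooks may no longer be trusted. */
static const struct obj_ops dummy_ops = { 0 };

static int obj_open(struct obj *o)
{
    int err = o->ops->open ? o->ops->open(o) : 0;

    if (err)                    /* open hook failed: never call its hooks again */
        o->ops = &dummy_ops;
    return err;
}

static void obj_close(struct obj *o)
{
    if (o->ops->close) {
        o->ops->close(o);
        o->ops = &dummy_ops;    /* closed: no further hooks */
    }
}

static int failing_open(struct obj *o) { (void)o; return -1; }
static void real_close(struct obj *o)  { (void)o; puts("close hook ran"); }

static const struct obj_ops real_ops = { failing_open, real_close };

int main(void)
{
    struct obj o = { &real_ops };

    if (obj_open(&o))           /* fails, so real_ops is replaced by dummy_ops */
        printf("open failed; close hook is now a no-op\n");
    obj_close(&o);              /* safely does nothing */
    return 0;
}

The benefit is that later teardown paths never need to know whether the original hook ran or how far it got: calling into the dummy table is always safe.
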
25 changes: 25 additions & 0 deletions mm/memcontrol-v1.c
@@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio,
css_get(&to->css);
css_put(&from->css);

/* Warning should never happen, so don't worry about refcount non-0 */
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;

__folio_memcg_unlock(from);
@@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct folio *folio;
bool tried_split_before = false;

retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
folio = target.folio;
/*
* Deferred split queue locking depends on memcg,
* and unqueue is unsafe unless folio refcount is 0:
* split or skip if on the queue? first try to split.
*/
if (!list_empty(&folio->_deferred_list)) {
spin_unlock(ptl);
if (!tried_split_before)
split_folio(folio);
folio_unlock(folio);
folio_put(folio);
if (tried_split_before)
return 0;
tried_split_before = true;
goto retry_pmd;
}
/*
* So long as that pmd lock is held, the folio cannot
* be racily added to the _deferred_list, because
* __folio_remove_rmap() will find !partially_mapped.
*/
if (folio_isolate_lru(folio)) {
if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
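
Not part of the diff: the retry_pmd/tried_split_before logic above follows a retry-once shape. On first finding a folio still on the deferred split queue, the locks are dropped, split_folio() is attempted as a corrective action, and the walk restarts; if the folio is met in that state a second time, it is skipped. A minimal sketch of that control flow, with illustrative names and all locking elided:

#include <stdbool.h>
#include <stdio.h>

struct item {
    bool on_deferred_queue;
};

/* Corrective action; in the kernel this is split_folio() after unlocking,
 * and it may or may not succeed. */
static void try_split(struct item *it)
{
    it->on_deferred_queue = false;
}

static bool handle(struct item *it)
{
    bool tried_split_before = false;

retry:
    if (it->on_deferred_queue) {
        if (!tried_split_before) {
            try_split(it);
            tried_split_before = true;
            goto retry;         /* re-walk from the top */
        }
        return false;           /* still queued on the second pass: skip */
    }
    /* ... safe to move the charge here ... */
    return true;
}

int main(void)
{
    struct item it = { .on_deferred_queue = true };

    printf("handled: %s\n", handle(&it) ? "yes" : "skipped");
    return 0;
}
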
9 changes: 5 additions & 4 deletions mm/memcontrol.c
@@ -4629,10 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
struct obj_cgroup *objcg;

VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
!folio_test_hugetlb(folio) &&
!list_empty(&folio->_deferred_list) &&
folio_test_partially_mapped(folio), folio);

/*
* Nobody should be changing or seriously looking at
@@ -4679,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;

WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}

@@ -4790,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)

/* Transfer the charge and the css ref */
commit_charge(new, memcg);

/* Warning should never happen, so don't worry about refcount non-0 */
WARN_ON_ONCE(folio_unqueue_deferred_split(old));
old->memcg_data = 0;
}

@@ -4976,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;

if (!mem_cgroup_is_root(memcg))
(Diffs for the remaining 10 changed files are not shown here.)
