Skip to content

Commit

Permalink
mm/hugetlb: unshare page tables during VMA split, not before
Browse files Browse the repository at this point in the history
commit 081056d upstream.

Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split().  This happens before the VMA lock and rmap locks are
taken - which is too early, it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.

Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens.  At that
point, both the VMA and the rmap(s) are write-locked.

An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:

1. from hugetlb_split(), holding:
    - mmap lock (exclusively)
    - VMA lock
    - file rmap lock (exclusively)
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
   call us with only the mmap lock held (in shared mode), but currently
   only runs while holding mmap lock (exclusively) and VMA lock

Backporting note:
This commit fixes a racy protection that was introduced in commit
b30c14c ("hugetlb: unshare some PMDs when splitting VMAs"); that
commit claimed to fix an issue introduced in 5.13, but it should actually
also go all the way back.

[jannh@google.com: v2]
  Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
Fixes: 39dde65 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>	[b30c14c: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[stable backport: code got moved from mmap.c to vma.c]
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  • Loading branch information
Jann Horn authored and Greg Kroah-Hartman committed Jun 27, 2025
1 parent 39c8683 commit af6cfcd
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 16 deletions.
3 changes: 3 additions & 0 deletions include/linux/hugetlb.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,

bool is_hugetlb_entry_migration(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);

#else /* !CONFIG_HUGETLB_PAGE */

Expand Down Expand Up @@ -491,6 +492,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,

static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}

#endif /* !CONFIG_HUGETLB_PAGE */
/*
* hugepages at page global directory. If arch support
Expand Down
60 changes: 44 additions & 16 deletions mm/hugetlb.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);

static inline bool subpool_is_free(struct hugepage_subpool *spool)
Expand Down Expand Up @@ -4903,26 +4903,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
return 0;
}

void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
* This function is called in the middle of a VMA split operation, with
* MM, VMA and rmap all write-locked to prevent concurrent page table
* walks (except hardware and gup_fast()).
*/
vma_assert_write_locked(vma);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);

if (addr & ~PUD_MASK) {
/*
* hugetlb_vm_op_split is called right before we attempt to
* split the VMA. We will need to unshare PMDs in the old and
* new VMAs, so let's unshare before we split.
*/
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;

if (floor >= vma->vm_start && ceil <= vma->vm_end)
hugetlb_unshare_pmds(vma, floor, ceil);
if (floor >= vma->vm_start && ceil <= vma->vm_end) {
/*
* Locking:
* Use take_locks=false here.
* The file rmap lock is already held.
* The hugetlb VMA lock can't be taken when we already
* hold the file rmap lock, and we don't need it because
* its purpose is to synchronize against concurrent page
* table walks, which are not possible thanks to the
* locks held by our caller.
*/
hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
}
}

return 0;
}

static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
Expand Down Expand Up @@ -7305,9 +7319,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
}

/*
* If @take_locks is false, the caller must ensure that no concurrent page table
* access can happen (except for gup_fast() and hardware page walks).
* If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
* concurrent page fault handling) and the file rmap lock.
*/
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
unsigned long end)
unsigned long end,
bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
Expand All @@ -7331,8 +7352,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
if (take_locks) {
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
} else {
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
}
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
Expand All @@ -7342,8 +7367,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
}
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
Expand All @@ -7358,7 +7385,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
ALIGN_DOWN(vma->vm_end, PUD_SIZE));
ALIGN_DOWN(vma->vm_end, PUD_SIZE),
/* take_locks = */ true);
}

#ifdef CONFIG_CMA
Expand Down
6 changes: 6 additions & 0 deletions mm/mmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2402,7 +2402,13 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
/*
* Get rid of huge pages and shared page tables straddling the split
* boundary.
*/
vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
if (is_vm_hugetlb_page(vma))
hugetlb_split(vma, addr);

if (new_below) {
vma->vm_start = addr;
Expand Down

0 comments on commit af6cfcd

Please sign in to comment.