diff --git a/mm/mremap.c b/mm/mremap.c
index 6305cb9a86f6..7dc058d5d5e2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -884,13 +884,13 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
  * Perform checks before attempting to write a VMA prior to it being
  * moved.
  */
-static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
-				   unsigned long *vm_flags_ptr)
+static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
 {
 	unsigned long err = 0;
 	struct vm_area_struct *vma = vrm->vma;
 	unsigned long old_addr = vrm->addr;
 	unsigned long old_len = vrm->old_len;
+	unsigned long dummy = vma->vm_flags;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -916,56 +916,151 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
 	 * so KSM can come around to merge on vma and new_vma afterwards.
 	 */
 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
-			  MADV_UNMERGEABLE, vm_flags_ptr);
+			  MADV_UNMERGEABLE, &dummy);
 	if (err)
 		return err;
 
 	return 0;
 }
 
-static unsigned long move_vma(struct vma_remap_struct *vrm)
+/*
+ * Unmap the source VMA for a VMA move, turning the operation from a copy into
+ * a move, being careful to ensure we do not underflow the memory accounting
+ * while doing so in the case of an accountable move.
+ *
+ * This is best effort; if we fail to unmap, we simply correct the accounting
+ * and exit.
+ */
+static void unmap_source_vma(struct vma_remap_struct *vrm)
 {
 	struct mm_struct *mm = current->mm;
+	unsigned long addr = vrm->addr;
+	unsigned long len = vrm->old_len;
 	struct vm_area_struct *vma = vrm->vma;
-	struct vm_area_struct *new_vma;
-	unsigned long vm_flags = vma->vm_flags;
-	unsigned long old_addr = vrm->addr, new_addr = vrm->new_addr;
-	unsigned long old_len = vrm->old_len, new_len = vrm->new_len;
-	unsigned long new_pgoff;
-	unsigned long moved_len;
-	unsigned long account_start = false;
-	unsigned long account_end = false;
-	unsigned long hiwater_vm;
+	VMA_ITERATOR(vmi, mm, addr);
 	int err;
-	bool need_rmap_locks;
-	struct vma_iterator vmi;
+	unsigned long vm_start;
+	unsigned long vm_end;
+	/*
+	 * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
+	 * function implies that we unmap the original VMA, which seems
+	 * contradictory.
+	 *
+	 * However, this occurs when the operation was attempted and an error
+	 * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
+	 * we actually _do_ want it to be unaccounted.
+	 */
+	bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
+		!(vrm->flags & MREMAP_DONTUNMAP);
 
-	err = prep_move_vma(vrm, &vm_flags);
-	if (err)
-		return err;
+	/*
+	 * So we perform a trick here to prevent incorrect accounting. Any merge
+	 * or new VMA allocation performed in copy_vma() does not adjust
+	 * accounting; it is expected that callers handle this.
+	 *
+	 * And indeed we already have done so, accounting appropriately for both
+	 * cases in vrm_charge().
+	 *
+	 * However, when we unmap the existing VMA (to effect the move), this
+	 * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
+	 * removed pages.
+	 *
+	 * To avoid this we temporarily clear this flag, reinstating it on any
+	 * portions of the original VMA that remain.
+	 */
+	if (accountable_move) {
+		vm_flags_clear(vma, VM_ACCOUNT);
+		/* We are about to split vma, so store the start/end. */
+		vm_start = vma->vm_start;
+		vm_end = vma->vm_end;
+	}
 
-	/* If accounted, charge the number of bytes the operation will use. */
-	if (!vrm_charge(vrm))
-		return -ENOMEM;
+	err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
+	vrm->vma = NULL; /* Invalidated. */
+	if (err) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_acct_memory(len >> PAGE_SHIFT);
+		return;
+	}
 
-	vma_start_write(vma);
-	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vrm->vma, new_addr, new_len, new_pgoff,
+	/*
+	 * If we mremap() from a VMA like this:
+	 *
+	 *    addr      end
+	 *     |         |
+	 *     v         v
+	 * |-------------|
+	 * |             |
+	 * |-------------|
+	 *
+	 * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
+	 * we'll end up with:
+	 *
+	 *    addr      end
+	 *     |         |
+	 *     v         v
+	 * |---|     |---|
+	 * | A |     | B |
+	 * |---|     |---|
+	 *
+	 * The VMI is still pointing at addr, so vma_prev() will give us A, and
+	 * a subsequent or lone vma_next() will give us B.
+	 *
+	 * do_vmi_munmap() will have restored the VMI back to addr.
+	 */
+	if (accountable_move) {
+		unsigned long end = addr + len;
+
+		if (vm_start < addr) {
+			struct vm_area_struct *prev = vma_prev(&vmi);
+
+			vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
+		}
+
+		if (vm_end > end) {
+			struct vm_area_struct *next = vma_next(&vmi);
+
+			vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
+		}
+	}
+}
+
+/*
+ * Copy vrm->vma over to vrm->new_addr, possibly adjusting size as part of the
+ * process. Additionally handle any error arising while moving page tables, in
+ * which case we reset vrm state to cause the new VMA to be unmapped.
+ *
+ * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
+ * error code otherwise.
+ */
+static int copy_vma_and_data(struct vma_remap_struct *vrm,
+			     struct vm_area_struct **new_vma_ptr)
+{
+	unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
+	unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
+	unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
+	unsigned long moved_len;
+	bool need_rmap_locks;
+	struct vm_area_struct *vma;
+	struct vm_area_struct *new_vma;
+	int err = 0;
+
+	new_vma = copy_vma(&vrm->vma, vrm->new_addr, vrm->new_len, new_pgoff,
 			   &need_rmap_locks);
-	/* This may have been updated. */
-	vma = vrm->vma;
 	if (!new_vma) {
 		vrm_uncharge(vrm);
+		*new_vma_ptr = NULL;
 		return -ENOMEM;
 	}
+	vma = vrm->vma;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
-				     need_rmap_locks, false);
-	if (moved_len < old_len) {
+	moved_len = move_page_tables(vma, vrm->addr, new_vma,
+				     vrm->new_addr, vrm->old_len,
+				     need_rmap_locks, /* for_stack= */false);
+	if (moved_len < vrm->old_len)
 		err = -ENOMEM;
-	} else if (vma->vm_ops && vma->vm_ops->mremap) {
+	else if (vma->vm_ops && vma->vm_ops->mremap)
 		err = vma->vm_ops->mremap(new_vma);
-	}
 
 	if (unlikely(err)) {
 		/*
@@ -973,28 +1068,84 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
-				 true, false);
-		vma = new_vma;
-		old_len = new_len;
-		old_addr = new_addr;
-		new_addr = err;
+		move_page_tables(new_vma, vrm->new_addr, vma, vrm->addr,
+				 moved_len, /* need_rmap_locks = */true,
+				 /* for_stack= */false);
+		vrm->vma = new_vma;
+		vrm->old_len = vrm->new_len;
+		vrm->addr = vrm->new_addr;
 	} else {
 		mremap_userfaultfd_prep(new_vma, vrm->uf);
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	if (is_vm_hugetlb_page(vma))
 		clear_vma_resv_huge_pages(vma);
-	}
 
-	/* Conceal VM_ACCOUNT so old reservation is not undone */
-	if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP)) {
-		vm_flags_clear(vma, VM_ACCOUNT);
-		if (vma->vm_start < old_addr)
-			account_start = true;
-		if (vma->vm_end > old_addr + old_len)
-			account_end = true;
-	}
+	/* Tell pfnmap has moved from this vma */
+	if (unlikely(vma->vm_flags & VM_PFNMAP))
+		untrack_pfn_clear(vma);
+
+	*new_vma_ptr = new_vma;
+	return err;
+}
+
+/*
+ * Perform final tasks for the MREMAP_DONTUNMAP operation, clearing mlock() and
+ * account flags on the remaining VMA by convention (it cannot be mlock()'d any
+ * longer, as pages in the range are no longer mapped), and removing
+ * anon_vma_chain links from it (if the entire VMA was copied over).
+ */
+static void dontunmap_complete(struct vma_remap_struct *vrm,
+			       struct vm_area_struct *new_vma)
+{
+	unsigned long start = vrm->addr;
+	unsigned long end = vrm->addr + vrm->old_len;
+	unsigned long old_start = vrm->vma->vm_start;
+	unsigned long old_end = vrm->vma->vm_end;
+
+	/*
+	 * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old
+	 * vma.
+	 */
+	vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT);
+
+	/*
+	 * anon_vma links of the old vma are no longer needed after its page
+	 * table has been moved.
+	 */
+	if (new_vma != vrm->vma && start == old_start && end == old_end)
+		unlink_anon_vmas(vrm->vma);
+
+	/* Because we won't unmap we don't need to touch locked_vm. */
+}
+
+static unsigned long move_vma(struct vma_remap_struct *vrm)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *new_vma;
+	unsigned long hiwater_vm;
+	int err;
+
+	err = prep_move_vma(vrm);
+	if (err)
+		return err;
+
+	/* If accounted, charge the number of bytes the operation will use. */
+	if (!vrm_charge(vrm))
+		return -ENOMEM;
+
+	/* We don't want racing faults. */
+	vma_start_write(vrm->vma);
+
+	/* Perform copy step. */
+	err = copy_vma_and_data(vrm, &new_vma);
+	/*
+	 * If we established the copied-to VMA, we attempt to recover from the
+	 * error by setting the destination VMA to the source VMA and unmapping
+	 * it below.
+	 */
+	if (err && !new_vma)
+		return err;
 
 	/*
 	 * If we failed to move page tables we still do total_vm increment
@@ -1007,51 +1158,15 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
 	 */
 	hiwater_vm = mm->hiwater_vm;
 
-	/* Tell pfnmap has moved from this vma */
-	if (unlikely(vma->vm_flags & VM_PFNMAP))
-		untrack_pfn_clear(vma);
-
-	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) {
-		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
-		vm_flags_clear(vma, VM_LOCKED_MASK);
-
-		/*
-		 * anon_vma links of the old vma is no longer needed after its page
-		 * table has been moved.
-		 */
-		if (new_vma != vma && vma->vm_start == old_addr &&
-		    vma->vm_end == (old_addr + old_len))
-			unlink_anon_vmas(vma);
-
-		/* Because we won't unmap we don't need to touch locked_vm */
-		vrm_stat_account(vrm, new_len);
-		return new_addr;
-	}
-
-	vrm_stat_account(vrm, new_len);
-
-	vma_iter_init(&vmi, mm, old_addr);
-	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, vrm->uf_unmap, false) < 0) {
-		/* OOM: unable to split vma, just get accounts right */
-		if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP))
-			vm_acct_memory(old_len >> PAGE_SHIFT);
-		account_start = account_end = false;
-	}
+	vrm_stat_account(vrm, vrm->new_len);
+	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
+		dontunmap_complete(vrm, new_vma);
+	else
+		unmap_source_vma(vrm);
 
 	mm->hiwater_vm = hiwater_vm;
 
-	/* Restore VM_ACCOUNT if one or two pieces of vma left */
-	if (account_start) {
-		vma = vma_prev(&vmi);
-		vm_flags_set(vma, VM_ACCOUNT);
-	}
-
-	if (account_end) {
-		vma = vma_next(&vmi);
-		vm_flags_set(vma, VM_ACCOUNT);
-	}
-
-	return new_addr;
+	return err ? (unsigned long)err : vrm->new_addr;
 }
 
 /*
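For readers unfamiliar with the user-visible semantics these helpers implement, here is a minimal userspace sketch (not part of the patch) exercising both paths: an MREMAP_DONTUNMAP move, which kernel-side is finished by dontunmap_complete(), and a plain move, whose source is torn down by unmap_source_vma(). It assumes Linux >= 5.7 for MREMAP_DONTUNMAP (which requires MREMAP_MAYMOVE, equal old/new sizes, and a private anonymous mapping); error handling is trimmed for brevity.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4	/* Missing from older libc headers. */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	char *src, *moved;

	src = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src == MAP_FAILED)
		return 1;
	memset(src, 'x', len);

	/*
	 * MREMAP_DONTUNMAP: the pages move to the new address, but the old
	 * range stays mapped and empty, so the next touch of it faults in
	 * fresh zeroed pages.
	 */
	moved = mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (moved == MAP_FAILED)
		return 1;

	/* moved[0] is 'x'; src[0] now reads back as 0 (fresh page). */
	printf("moved %p -> %p, moved[0] = '%c', src[0] = %d\n",
	       (void *)src, (void *)moved, moved[0], (int)src[0]);

	/*
	 * A plain fixed-destination move: the source range at 'moved' is
	 * unmapped as part of the operation (the unmap_source_vma() path),
	 * and MREMAP_FIXED atomically replaces whatever is at 'src'.
	 */
	moved = mremap(moved, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, src);
	return moved == MAP_FAILED;
}

Moving the pages back onto 'src' in the second call is deliberate: that range is known to be mapped (MREMAP_DONTUNMAP left it in place), so the forced unmap that MREMAP_FIXED performs cannot clobber an unrelated mapping.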