mm/hmm: allow to mirror vma of a file on a DAX backed filesystem
HMM mirror is a device driver helper to mirror a range of virtual
addresses.  It means that the process jobs running on the device can
access the same virtual addresses as the CPU threads of that process.
This patch adds support for mirroring mappings of files that are on a
DAX block device (ie a range of virtual addresses that is an mmap of a
file in a filesystem on a DAX block device).  There is no reason not to
support such a case when mirroring virtual addresses on a device.
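
As a minimal sketch of the caller side (illustrative only: the my_*
name is hypothetical, and range registration, locking and invalidation
handling are elided), a DAX backed mmap can now be mirrored through the
same path as anonymous memory:

	static long my_mirror_dax_mapping(struct hmm_range *range,
					  unsigned long start,
					  unsigned long end,
					  uint64_t *pfns)
	{
		/*
		 * Describe the virtual address range to mirror; it may
		 * now be an mmap() of a file on a DAX filesystem.
		 */
		range->start = start;
		range->end = end;
		range->pfns = pfns;	/* caller allocated, one per page */

		/* Before this patch a DAX vma failed with -EINVAL. */
		return hmm_range_fault(range, true /* block */);
	}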

Note that unlike the GUP code we do not take a page reference, hence
when we back off we have nothing to undo.
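
The one reference the walker does hold is on the struct dev_pagemap
backing the device pages, cached in hmm_vma_walk.  Distilled from the
hunks below, the idiom is (a sketch, not standalone code):

	/*
	 * get_dev_pagemap() returns the cached pgmap without taking a
	 * new reference when pfn still falls inside it; otherwise it
	 * drops the cached reference and looks up (and references)
	 * the pgmap covering pfn.
	 */
	hmm_vma_walk->pgmap = get_dev_pagemap(pfn, hmm_vma_walk->pgmap);
	if (unlikely(!hmm_vma_walk->pgmap))
		return -EBUSY;	/* back off; nothing else to undo */

	/* One put at the end of the walk balances the cached reference. */
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}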

[jglisse@redhat.com: move THP and hugetlbfs code path behind #if KCONFIG]
  Link: http://lkml.kernel.org/r/20190422163741.13029-1-jglisse@redhat.com
Link: http://lkml.kernel.org/r/20190403193318.16478-10-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Jérôme Glisse authored and Linus Torvalds committed May 14, 2019
1 parent 63d5066 commit 992de9a
147 changes: 126 additions & 21 deletions mm/hmm.c
@@ -329,6 +329,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
 
 struct hmm_vma_walk {
 	struct hmm_range	*range;
+	struct dev_pagemap	*pgmap;
 	unsigned long		last;
 	bool			fault;
 	bool			block;
@@ -503,12 +504,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
 				range->flags[HMM_PFN_VALID];
 }
 
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+	if (!pud_present(pud))
+		return 0;
+	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+				range->flags[HMM_PFN_WRITE] :
+				range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
 			      unsigned long addr,
 			      unsigned long end,
 			      uint64_t *pfns,
 			      pmd_t pmd)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
 	unsigned long pfn, npages, i;
@@ -524,10 +535,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
 		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + pte_index(addr);
-	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+		if (pmd_devmap(pmd)) {
+			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+					      hmm_vma_walk->pgmap);
+			if (unlikely(!hmm_vma_walk->pgmap))
+				return -EBUSY;
+		}
 		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+	}
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	hmm_vma_walk->last = end;
 	return 0;
+#else
+	/* If THP is not enabled then we should never reach that code ! */
+	return -EINVAL;
+#endif
 }
 
 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
@@ -612,10 +638,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	if (fault || write_fault)
 		goto fault;
 
+	if (pte_devmap(pte)) {
+		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+					      hmm_vma_walk->pgmap);
+		if (unlikely(!hmm_vma_walk->pgmap))
+			return -EBUSY;
+	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+		*pfn = range->values[HMM_PFN_SPECIAL];
+		return -EFAULT;
+	}
+
 	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
 	return 0;
 
 fault:
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
 	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -703,12 +743,93 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return r;
 		}
 	}
+	if (hmm_vma_walk->pgmap) {
+		/*
+		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+		 * so that we can leverage get_dev_pagemap() optimization which
+		 * will not re-take a reference on a pgmap if we already have
+		 * one.
+		 */
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	pte_unmap(ptep - 1);
 
 	hmm_vma_walk->last = addr;
 	return 0;
 }
 
+static int hmm_vma_walk_pud(pud_t *pudp,
+			    unsigned long start,
+			    unsigned long end,
+			    struct mm_walk *walk)
+{
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
+	unsigned long addr = start, next;
+	pmd_t *pmdp;
+	pud_t pud;
+	int ret;
+
+again:
+	pud = READ_ONCE(*pudp);
+	if (pud_none(pud))
+		return hmm_vma_walk_hole(start, end, walk);
+
+	if (pud_huge(pud) && pud_devmap(pud)) {
+		unsigned long i, npages, pfn;
+		uint64_t *pfns, cpu_flags;
+		bool fault, write_fault;
+
+		if (!pud_present(pud))
+			return hmm_vma_walk_hole(start, end, walk);
+
+		i = (addr - range->start) >> PAGE_SHIFT;
+		npages = (end - addr) >> PAGE_SHIFT;
+		pfns = &range->pfns[i];
+
+		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+				     cpu_flags, &fault, &write_fault);
+		if (fault || write_fault)
+			return hmm_vma_walk_hole_(addr, end, fault,
+						  write_fault, walk);
+
+#ifdef CONFIG_HUGETLB_PAGE
+		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+		for (i = 0; i < npages; ++i, ++pfn) {
+			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+					      hmm_vma_walk->pgmap);
+			if (unlikely(!hmm_vma_walk->pgmap))
+				return -EBUSY;
+			pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+		}
+		if (hmm_vma_walk->pgmap) {
+			put_dev_pagemap(hmm_vma_walk->pgmap);
+			hmm_vma_walk->pgmap = NULL;
+		}
+		hmm_vma_walk->last = end;
+		return 0;
+#else
+		return -EINVAL;
+#endif
+	}
+
+	split_huge_pud(walk->vma, pudp, addr);
+	if (pud_none(*pudp))
+		goto again;
+
+	pmdp = pmd_offset(pudp, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 0;
+}
+
 static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 				      unsigned long start, unsigned long end,
 				      struct mm_walk *walk)
@@ -781,14 +902,6 @@ static void hmm_pfns_clear(struct hmm_range *range,
 		*pfns = range->values[HMM_PFN_NONE];
 }
 
-static void hmm_pfns_special(struct hmm_range *range)
-{
-	unsigned long addr = range->start, i = 0;
-
-	for (; addr < range->end; addr += PAGE_SIZE, i++)
-		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
 /*
  * hmm_range_register() - start tracking change to CPU page table over a range
  * @range: range
@@ -906,12 +1019,6 @@ long hmm_range_snapshot(struct hmm_range *range)
 	if (vma == NULL || (vma->vm_flags & device_vma))
 		return -EFAULT;
 
-	/* FIXME support dax */
-	if (vma_is_dax(vma)) {
-		hmm_pfns_special(range);
-		return -EINVAL;
-	}
-
 	if (is_vm_hugetlb_page(vma)) {
 		struct hstate *h = hstate_vma(vma);
 
@@ -935,6 +1042,7 @@ long hmm_range_snapshot(struct hmm_range *range)
 	}
 
 	range->vma = vma;
+	hmm_vma_walk.pgmap = NULL;
 	hmm_vma_walk.last = start;
 	hmm_vma_walk.fault = false;
 	hmm_vma_walk.range = range;
@@ -946,6 +1054,7 @@ long hmm_range_snapshot(struct hmm_range *range)
 	mm_walk.pte_entry = NULL;
 	mm_walk.test_walk = NULL;
 	mm_walk.hugetlb_entry = NULL;
+	mm_walk.pud_entry = hmm_vma_walk_pud;
 	mm_walk.pmd_entry = hmm_vma_walk_pmd;
 	mm_walk.pte_hole = hmm_vma_walk_hole;
 	mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
@@ -1011,12 +1120,6 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 	if (vma == NULL || (vma->vm_flags & device_vma))
 		return -EFAULT;
 
-	/* FIXME support dax */
-	if (vma_is_dax(vma)) {
-		hmm_pfns_special(range);
-		return -EINVAL;
-	}
-
 	if (is_vm_hugetlb_page(vma)) {
 		if (huge_page_shift(hstate_vma(vma)) !=
 		    range->page_shift &&
@@ -1039,6 +1142,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 	}
 
 	range->vma = vma;
+	hmm_vma_walk.pgmap = NULL;
 	hmm_vma_walk.last = start;
 	hmm_vma_walk.fault = true;
 	hmm_vma_walk.block = block;
@@ -1051,6 +1155,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 	mm_walk.pte_entry = NULL;
 	mm_walk.test_walk = NULL;
 	mm_walk.hugetlb_entry = NULL;
+	mm_walk.pud_entry = hmm_vma_walk_pud;
 	mm_walk.pmd_entry = hmm_vma_walk_pmd;
 	mm_walk.pte_hole = hmm_vma_walk_hole;
 	mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
