Skip to content

Commit

Permalink
lib: add support for device coherent type in test_hmm
Browse files Browse the repository at this point in the history
Device Coherent type uses device memory that is coherently accessible by
the CPU.  This could be shown as SP (special purpose) memory range at the
BIOS-e820 memory enumeration.  If no SP memory is supported in system,
this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. This could be specified in the kernel parameter variable
efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
0x140000000 physical address. Ex.
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Private and coherent device mirror instances can be created in the same
probe.  This is done by passing the module parameters spm_addr_dev0 &
spm_addr_dev1.  In this case, it will create four instances of
device_mirror.  The first two correspond to private device type, the last
two to coherent type.  Then, they can be easily accessed from user space
through /dev/hmm_mirror<num_device>.  Usually num_device 0 and 1 are for
private, and 2 and 3 for coherent types.  If no module parameters are
passed, two instances of private type device_mirror will be created only.

Link: https://lkml.kernel.org/r/20220715150521.18165-11-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  • Loading branch information
Alex Sierra authored and akpm committed Jul 18, 2022
1 parent 25b8016 commit 4c2e0f7
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 61 deletions.
253 changes: 192 additions & 61 deletions lib/test_hmm.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,22 @@

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES 2
#define DMIRROR_NDEVICES 4
#define DMIRROR_RANGE_FAULT_TIMEOUT 1000
#define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE 16

/*
* For device_private pages, dpage is just a dummy struct page
* representing a piece of device memory. dmirror_devmem_alloc_page
* allocates a real system memory page as backing storage to fake a
* real device. zone_device_data points to that backing page. But
* for device_coherent memory, the struct page represents real
* physical CPU-accessible memory that we can use directly.
*/
#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
(page)->zone_device_data : (page))

static unsigned long spm_addr_dev0;
module_param(spm_addr_dev0, long, 0644);
MODULE_PARM_DESC(spm_addr_dev0,
Expand Down Expand Up @@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce,
return 0;
}

/*
 * Return true when this mirror device is backed by ZONE_DEVICE private
 * memory (fake device memory mirrored by a system-RAM backing page),
 * false for the device-coherent (directly CPU-accessible) type.
 */
static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
{
	/* The comparison already yields a bool; '? true : false' was redundant. */
	return mdevice->zone_device_type == HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
}

/*
 * Pick the migrate_vma source-selection flag that matches this mirror's
 * ZONE_DEVICE memory type (device-private vs. device-coherent).
 */
static enum migrate_vma_direction
dmirror_select_device(struct dmirror *dmirror)
{
	if (dmirror->mdevice->zone_device_type ==
	    HMM_DMIRROR_MEMORY_DEVICE_PRIVATE)
		return MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	return MIGRATE_VMA_SELECT_DEVICE_COHERENT;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
vfree(bounce->ptr);
Expand Down Expand Up @@ -575,16 +601,19 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
struct page *dpage = NULL;
struct page *rpage;
struct page *rpage = NULL;

/*
* This is a fake device so we alloc real system memory to store
* our device memory.
* For ZONE_DEVICE private type, this is a fake device so we allocate
* real system memory to store our device memory.
* For ZONE_DEVICE coherent type we use the actual dpage to store the
* data and ignore rpage.
*/
rpage = alloc_page(GFP_HIGHUSER);
if (!rpage)
return NULL;

if (dmirror_is_private_zone(mdevice)) {
rpage = alloc_page(GFP_HIGHUSER);
if (!rpage)
return NULL;
}
spin_lock(&mdevice->lock);

if (mdevice->free_pages) {
Expand All @@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
return dpage;

error:
__free_page(rpage);
if (rpage)
__free_page(rpage);
return NULL;
}

Expand All @@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
* unallocated pte_none() or read-only zero page.
*/
spage = migrate_pfn_to_page(*src);
if (WARN(spage && is_zone_device_page(spage),
"page already in device spage pfn: 0x%lx\n",
page_to_pfn(spage)))
continue;

dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;

rpage = dpage->zone_device_data;
rpage = BACKING_PAGE(dpage);
if (spage)
copy_highpage(rpage, spage);
else
Expand All @@ -648,6 +682,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
*/
rpage->zone_device_data = dmirror;

pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
page_to_pfn(spage), page_to_pfn(dpage));
*dst = migrate_pfn(page_to_pfn(dpage));
if ((*src & MIGRATE_PFN_WRITE) ||
(!spage && args->vma->vm_flags & VM_WRITE))
Expand Down Expand Up @@ -725,11 +761,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
if (!dpage)
continue;

/*
* Store the page that holds the data so the page table
* doesn't have to deal with ZONE_DEVICE private pages.
*/
entry = dpage->zone_device_data;
entry = BACKING_PAGE(dpage);
if (*dst & MIGRATE_PFN_WRITE)
entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
Expand Down Expand Up @@ -815,15 +847,126 @@ static int dmirror_exclusive(struct dmirror *dmirror,
return ret;
}

static int dmirror_migrate(struct dmirror *dmirror,
struct hmm_dmirror_cmd *cmd)
/*
 * Migrate helper for device-to-system faults: for every source entry that
 * was selected for migration, allocate a system page, copy the device data
 * into it (via the backing page for device-private memory) and publish the
 * new pfn in the destination array. Entries that cannot be migrated are
 * simply skipped; migrate_vma_finalize() deals with them later.
 */
static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src_pfn = args->src;
	unsigned long *dst_pfn = args->dst;
	unsigned long vaddr;

	for (vaddr = args->start; vaddr < args->end;
	     vaddr += PAGE_SIZE, src_pfn++, dst_pfn++) {
		struct page *src_page;
		struct page *dst_page;

		src_page = migrate_pfn_to_page(*src_pfn);
		if (!src_page || !(*src_pfn & MIGRATE_PFN_MIGRATE))
			continue;

		/* Only device-backed pages are expected on this path. */
		if (WARN_ON(!is_device_private_page(src_page) &&
			    !is_device_coherent_page(src_page)))
			continue;

		src_page = BACKING_PAGE(src_page);
		dst_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, vaddr);
		if (!dst_page)
			continue;
		pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
			 page_to_pfn(src_page), page_to_pfn(dst_page));

		lock_page(dst_page);
		/* Drop the mirror's page-table entry for this address. */
		xa_erase(&dmirror->pt, vaddr >> PAGE_SHIFT);
		copy_highpage(dst_page, src_page);
		*dst_pfn = migrate_pfn(page_to_pfn(dst_page));
		if (*src_pfn & MIGRATE_PFN_WRITE)
			*dst_pfn |= MIGRATE_PFN_WRITE;
	}
	return 0;
}

/*
 * Count the source entries that were both valid and actually selected
 * for migration, i.e. the pages that really moved.
 */
static unsigned long
dmirror_successful_migrated_pages(struct migrate_vma *migrate)
{
	const unsigned long wanted = MIGRATE_PFN_VALID | MIGRATE_PFN_MIGRATE;
	unsigned long migrated = 0;
	unsigned long idx;

	for (idx = 0; idx < migrate->npages; idx++)
		if ((migrate->src[idx] & wanted) == wanted)
			migrated++;

	return migrated;
}

static int dmirror_migrate_to_system(struct dmirror *dmirror,
struct hmm_dmirror_cmd *cmd)
{
unsigned long start, end, addr;
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
struct vm_area_struct *vma;
unsigned long src_pfns[64];
unsigned long dst_pfns[64];
unsigned long src_pfns[64] = { 0 };
unsigned long dst_pfns[64] = { 0 };
struct migrate_vma args;
unsigned long next;
int ret;

start = cmd->addr;
end = start + size;
if (end < start)
return -EINVAL;

/* Since the mm is for the mirrored process, get a reference first. */
if (!mmget_not_zero(mm))
return -EINVAL;

cmd->cpages = 0;
mmap_read_lock(mm);
for (addr = start; addr < end; addr = next) {
vma = vma_lookup(mm, addr);
if (!vma || !(vma->vm_flags & VM_READ)) {
ret = -EINVAL;
goto out;
}
next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
if (next > vma->vm_end)
next = vma->vm_end;

args.vma = vma;
args.src = src_pfns;
args.dst = dst_pfns;
args.start = addr;
args.end = next;
args.pgmap_owner = dmirror->mdevice;
args.flags = dmirror_select_device(dmirror);

ret = migrate_vma_setup(&args);
if (ret)
goto out;

pr_debug("Migrating from device mem to sys mem\n");
dmirror_devmem_fault_alloc_and_copy(&args, dmirror);

migrate_vma_pages(&args);
cmd->cpages += dmirror_successful_migrated_pages(&args);
migrate_vma_finalize(&args);
}
out:
mmap_read_unlock(mm);
mmput(mm);

return ret;
}

static int dmirror_migrate_to_device(struct dmirror *dmirror,
struct hmm_dmirror_cmd *cmd)
{
unsigned long start, end, addr;
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
struct vm_area_struct *vma;
unsigned long src_pfns[64] = { 0 };
unsigned long dst_pfns[64] = { 0 };
struct dmirror_bounce bounce;
struct migrate_vma args;
unsigned long next;
Expand Down Expand Up @@ -860,6 +1003,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
if (ret)
goto out;

pr_debug("Migrating from sys mem to device mem\n");
dmirror_migrate_alloc_and_copy(&args, dmirror);
migrate_vma_pages(&args);
dmirror_migrate_finalize_and_map(&args, dmirror);
Expand All @@ -868,7 +1012,10 @@ static int dmirror_migrate(struct dmirror *dmirror,
mmap_read_unlock(mm);
mmput(mm);

/* Return the migrated data for verification. */
/*
* Return the migrated data for verification.
* Only for pages in device zone
*/
ret = dmirror_bounce_init(&bounce, start, size);
if (ret)
return ret;
Expand Down Expand Up @@ -911,6 +1058,12 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
else
*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
} else if (is_device_coherent_page(page)) {
/* Is the page migrated to this device or some other? */
if (dmirror->mdevice == dmirror_page_to_device(page))
*perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
else
*perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
} else if (is_zero_pfn(page_to_pfn(page)))
*perm = HMM_DMIRROR_PROT_ZERO;
else
Expand Down Expand Up @@ -1098,8 +1251,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_write(dmirror, &cmd);
break;

case HMM_DMIRROR_MIGRATE:
ret = dmirror_migrate(dmirror, &cmd);
case HMM_DMIRROR_MIGRATE_TO_DEV:
ret = dmirror_migrate_to_device(dmirror, &cmd);
break;

case HMM_DMIRROR_MIGRATE_TO_SYS:
ret = dmirror_migrate_to_system(dmirror, &cmd);
break;

case HMM_DMIRROR_EXCLUSIVE:
Expand Down Expand Up @@ -1161,58 +1318,25 @@ static const struct file_operations dmirror_fops = {

static void dmirror_devmem_free(struct page *page)
{
struct page *rpage = page->zone_device_data;
struct page *rpage = BACKING_PAGE(page);
struct dmirror_device *mdevice;

if (rpage)
if (rpage != page)
__free_page(rpage);

mdevice = dmirror_page_to_device(page);

spin_lock(&mdevice->lock);
mdevice->cfree++;
page->zone_device_data = mdevice->free_pages;
mdevice->free_pages = page;
spin_unlock(&mdevice->lock);
}

static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
struct dmirror *dmirror)
{
const unsigned long *src = args->src;
unsigned long *dst = args->dst;
unsigned long start = args->start;
unsigned long end = args->end;
unsigned long addr;

for (addr = start; addr < end; addr += PAGE_SIZE,
src++, dst++) {
struct page *dpage, *spage;

spage = migrate_pfn_to_page(*src);
if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
continue;
spage = spage->zone_device_data;

dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
if (!dpage)
continue;

lock_page(dpage);
xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
copy_highpage(dpage, spage);
*dst = migrate_pfn(page_to_pfn(dpage));
if (*src & MIGRATE_PFN_WRITE)
*dst |= MIGRATE_PFN_WRITE;
}
return 0;
}

static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
struct migrate_vma args;
unsigned long src_pfns;
unsigned long dst_pfns;
unsigned long src_pfns = 0;
unsigned long dst_pfns = 0;
struct page *rpage;
struct dmirror *dmirror;
vm_fault_t ret;
Expand All @@ -1232,7 +1356,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
args.src = &src_pfns;
args.dst = &dst_pfns;
args.pgmap_owner = dmirror->mdevice;
args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
args.flags = dmirror_select_device(dmirror);

if (migrate_vma_setup(&args))
return VM_FAULT_SIGBUS;
Expand Down Expand Up @@ -1311,6 +1435,12 @@ static int __init hmm_dmirror_init(void)
HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
dmirror_devices[ndevices++].zone_device_type =
HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
if (spm_addr_dev0 && spm_addr_dev1) {
dmirror_devices[ndevices++].zone_device_type =
HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
dmirror_devices[ndevices++].zone_device_type =
HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
}
for (id = 0; id < ndevices; id++) {
ret = dmirror_device_init(dmirror_devices + id, id);
if (ret)
Expand All @@ -1333,7 +1463,8 @@ static void __exit hmm_dmirror_exit(void)
int id;

for (id = 0; id < DMIRROR_NDEVICES; id++)
dmirror_device_remove(dmirror_devices + id);
if (dmirror_devices[id].zone_device_type)
dmirror_device_remove(dmirror_devices + id);
unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

Expand Down
Loading

0 comments on commit 4c2e0f7

Please sign in to comment.