fs/dax: properly refcount fs dax pages
Currently fs dax pages are considered free when the refcount drops to one
and their refcounts are not increased when mapped via PTEs or decreased
when unmapped.  This requires special logic in mm paths to detect that
these pages should not be properly refcounted, and to detect when the
refcount drops to one instead of zero.

On the other hand, get_user_pages() etc. will properly refcount fs dax
pages by taking a reference and dropping it when the page is unpinned.

Tracking this special behaviour requires extra PTE bits (e.g. pte_devmap)
and introduces rules that are potentially confusing and specific to FS DAX
pages.  To fix this, and to possibly allow removal of the special PTE bits
in future, convert the fs dax page refcounts to be zero based and instead
take a reference on the page each time it is mapped as is currently the
case for normal pages.

This may also allow a future clean-up to remove the pgmap refcounting that
is currently done in mm/gup.c.

Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Alison Schofield <alison.schofield@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Asahi Lina <lina@asahilina.net>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: linmiaohe <linmiaohe@huawei.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael "Camp Drill Sergeant" Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Alistair Popple authored and Andrew Morton committed Mar 18, 2025
1 parent 653d782 commit 38607c6
Showing 14 changed files with 165 additions and 151 deletions.
4 changes: 1 addition & 3 deletions drivers/nvdimm/pmem.c
@@ -513,7 +513,7 @@ static int pmem_attach_disk(struct device *dev,

pmem->disk = disk;
pmem->pgmap.owner = pmem;
pmem->pfn_flags = PFN_DEV;
pmem->pfn_flags = 0;
if (is_nd_pfn(dev)) {
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
@@ -522,7 +522,6 @@ static int pmem_attach_disk(struct device *dev,
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
pmem->pfn_pad = resource_size(res) -
range_len(&pmem->pgmap.range);
pmem->pfn_flags |= PFN_MAP;
bb_range = pmem->pgmap.range;
bb_range.start += pmem->data_offset;
} else if (pmem_should_map_pages(dev)) {
@@ -532,7 +531,6 @@ static int pmem_attach_disk(struct device *dev,
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP;
bb_range = pmem->pgmap.range;
} else {
addr = devm_memremap(dev, pmem->phys_addr,
186 changes: 114 additions & 72 deletions fs/dax.c
@@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *entry)
return xa_to_value(entry) >> DAX_SHIFT;
}

static struct folio *dax_to_folio(void *entry)
{
return page_folio(pfn_to_page(dax_to_pfn(entry)));
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
Expand Down Expand Up @@ -338,19 +343,6 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}

static unsigned long dax_end_pfn(void *entry)
{
return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}

/*
* Iterate through all mapped pfns represented by an entry, i.e. skip
* 'empty' and 'zero' entries.
*/
#define for_each_mapped_pfn(entry, pfn) \
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)

/*
* A DAX folio is considered shared if it has no mapping set and ->share (which
* shares the ->index field) is non-zero. Note this may return false even if the
@@ -359,7 +351,7 @@ static unsigned long dax_end_pfn(void *entry)
*/
static inline bool dax_folio_is_shared(struct folio *folio)
{
return !folio->mapping && folio->page.share;
return !folio->mapping && folio->share;
}

/*
@@ -384,75 +376,117 @@ static void dax_folio_make_shared(struct folio *folio)
* folio has previously been mapped into one address space so set the
* share count.
*/
folio->page.share = 1;
folio->share = 1;
}

static inline unsigned long dax_folio_share_put(struct folio *folio)
static inline unsigned long dax_folio_put(struct folio *folio)
{
return --folio->page.share;
unsigned long ref;
int order, i;

if (!dax_folio_is_shared(folio))
ref = 0;
else
ref = --folio->share;

if (ref)
return ref;

folio->mapping = NULL;
order = folio_order(folio);
if (!order)
return 0;

for (i = 0; i < (1UL << order); i++) {
struct dev_pagemap *pgmap = page_pgmap(&folio->page);
struct page *page = folio_page(folio, i);
struct folio *new_folio = (struct folio *)page;

ClearPageHead(page);
clear_compound_head(page);

new_folio->mapping = NULL;
/*
* Reset pgmap which was over-written by
* prep_compound_page().
*/
new_folio->pgmap = pgmap;
new_folio->share = 0;
WARN_ON_ONCE(folio_ref_count(new_folio));
}

return ref;
}

static void dax_folio_init(void *entry)
{
struct folio *folio = dax_to_folio(entry);
int order = dax_entry_order(entry);

/*
* Folio should have been split back to order-0 pages in
* dax_folio_put() when they were removed from their
* final mapping.
*/
WARN_ON_ONCE(folio_order(folio));

if (order > 0) {
prep_compound_page(&folio->page, order);
if (order > 1)
INIT_LIST_HEAD(&folio->_deferred_list);
WARN_ON_ONCE(folio_ref_count(folio));
}
}

static void dax_associate_entry(void *entry, struct address_space *mapping,
struct vm_area_struct *vma, unsigned long address, bool shared)
struct vm_area_struct *vma,
unsigned long address, bool shared)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
unsigned long size = dax_entry_size(entry), index;
struct folio *folio = dax_to_folio(entry);

if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return;

index = linear_page_index(vma, address & ~(size - 1));
for_each_mapped_pfn(entry, pfn) {
struct folio *folio = pfn_folio(pfn);

if (shared && (folio->mapping || folio->page.share)) {
if (folio->mapping)
dax_folio_make_shared(folio);
if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
if (folio->mapping)
dax_folio_make_shared(folio);

WARN_ON_ONCE(!folio->page.share);
folio->page.share++;
} else {
WARN_ON_ONCE(folio->mapping);
folio->mapping = mapping;
folio->index = index + i++;
}
WARN_ON_ONCE(!folio->share);
WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
folio->share++;
} else {
WARN_ON_ONCE(folio->mapping);
dax_folio_init(entry);
folio = dax_to_folio(entry);
folio->mapping = mapping;
folio->index = index;
}
}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
bool trunc)
bool trunc)
{
unsigned long pfn;
struct folio *folio = dax_to_folio(entry);

if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return;

for_each_mapped_pfn(entry, pfn) {
struct folio *folio = pfn_folio(pfn);

WARN_ON_ONCE(trunc && folio_ref_count(folio) > 1);
if (dax_folio_is_shared(folio)) {
/* keep the shared flag if this page is still shared */
if (dax_folio_share_put(folio) > 0)
continue;
} else
WARN_ON_ONCE(folio->mapping && folio->mapping != mapping);
folio->mapping = NULL;
folio->index = 0;
}
dax_folio_put(folio);
}

static struct page *dax_busy_page(void *entry)
{
unsigned long pfn;
struct folio *folio = dax_to_folio(entry);

for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
return NULL;

if (page_ref_count(page) > 1)
return page;
}
return NULL;
if (folio_ref_count(folio) - folio_mapcount(folio))
return &folio->page;
else
return NULL;
}

/**
Expand Down Expand Up @@ -785,7 +819,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
EXPORT_SYMBOL_GPL(dax_layout_busy_page);

static int __dax_invalidate_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
pgoff_t index, bool trunc)
{
XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
Expand Down Expand Up @@ -953,7 +987,8 @@ void dax_break_layout_final(struct inode *inode)
wait_page_idle_uninterruptible(page, inode);
} while (true);

dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
if (!page)
dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_break_layout_final);

Expand Down Expand Up @@ -1039,8 +1074,10 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
void *old;

dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
shared);
if (!(flags & DAX_ZERO_PAGE))
dax_associate_entry(new_entry, mapping, vmf->vma,
vmf->address, shared);

/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
Expand Down Expand Up @@ -1228,9 +1265,7 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
goto out;
if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
goto out;
/* For larger pages we need devmap */
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;

rc = 0;

out_check_addr:
Expand Down Expand Up @@ -1337,7 +1372,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,

*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false);
trace_dax_load_hole(inode, vmf, ret);
return ret;
}
Expand Down Expand Up @@ -1808,7 +1843,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
int err = 0;
struct folio *folio;
int ret, err = 0;
pfn_t pfn;
void *kaddr;

Expand Down Expand Up @@ -1840,17 +1876,19 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
return dax_fault_return(err);
}

folio = dax_to_folio(*entry);
if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);

/* insert PMD pfn */
folio_ref_inc(folio);
if (pmd)
return vmf_insert_pfn_pmd(vmf, pfn, write);
ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)),
write);
else
ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write);
folio_put(folio);

/* insert PTE pfn */
if (write)
return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
return ret;
}

static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
Expand Down Expand Up @@ -2089,6 +2127,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
struct folio *folio;
void *entry;
vm_fault_t ret;

@@ -2106,14 +2145,17 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
dax_lock_entry(&xas, entry);
xas_unlock_irq(&xas);
folio = pfn_folio(pfn_t_to_pfn(pfn));
folio_ref_inc(folio);
if (order == 0)
ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
#ifdef CONFIG_FS_DAX_PMD
else if (order == PMD_ORDER)
ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
#endif
else
ret = VM_FAULT_FALLBACK;
folio_put(folio);
dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
return ret;
3 changes: 1 addition & 2 deletions fs/fuse/virtio_fs.c
@@ -1017,8 +1017,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
PFN_DEV | PFN_MAP);
*pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0);
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}

Expand Down
2 changes: 1 addition & 1 deletion include/linux/dax.h
@@ -209,7 +209,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,

static inline bool dax_page_is_idle(struct page *page)
{
return page && page_ref_count(page) == 1;
return page && page_ref_count(page) == 0;
}

#if IS_ENABLED(CONFIG_DAX)
