Merge branch 'for-5.17/struct-slab' into for-linus
Series "Separate struct slab from struct page" v4

This series originated as an offshoot of the folio work by Matthew. One of the more
complex parts of the struct page definition is the set of fields used by the slab
allocators. It would be good for the MM in general if struct slab were its own
data type, and it also helps to prevent tail pages from slipping in anywhere.
As Matthew requested in his proof-of-concept series, I have taken over the
development of this series, so it's a mix of patches from him (often modified
by me) and my own.

One big difference is the use of Coccinelle to perform the relatively trivial
parts of the conversions automatically and all at once, instead of as a larger
number of smaller, incrementally reviewable steps. Thanks to Julia Lawall and
Luis Chamberlain for all their help!

Another notable difference, based also on review feedback, is that I don't
represent large kmalloc allocations with a struct slab: they are not really
slabs and use the page allocator directly. When going from an object address
to a struct slab, the code first tests the folio slab flag, and only if it is
set does it convert to a struct slab. This makes the struct slab type stronger.
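
To illustrate that conversion pattern, below is a simplified sketch in the
spirit of the virt_to_slab()/folio_slab() helpers the series introduces in
mm/slab.h; it is a trimmed illustration, not a verbatim copy of the patched
code.

/* Sketch: convert an object address to its struct slab, or NULL. */
static inline struct slab *virt_to_slab(const void *addr)
{
        struct folio *folio = virt_to_folio(addr);

        /*
         * Large kmalloc allocations come straight from the page allocator
         * and are not slabs, so refuse to produce a struct slab for them.
         */
        if (!folio_test_slab(folio))
                return NULL;

        /* Same underlying memory as the folio, but a stronger type. */
        return folio_slab(folio);
}

Returning NULL for non-slab memory lets callers such as __kasan_krealloc()
(see the mm/kasan/common.c hunk below) branch on the result instead of
re-testing page flags.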

Finally, although Matthew's version didn't use any of the folio work, the
initial folio support has since been merged, so my version builds on top of it
where appropriate. This eliminates some of the redundant compound_head() calls
performed e.g. when testing the slab flag.

To sum up, after this series the struct page fields used by slab allocators are
moved from struct page to a new struct slab that reuses the same physical
storage (a trimmed sketch of this overlay follows the list below). Which of
these fields are available further depends on the selected slab allocator
implementation. The advantages include:

- Similar to folios, if the slab is of order > 0, struct slab is always
  guaranteed to be the head page. Additionally it's guaranteed to be an actual
  slab page, not a large kmalloc allocation. This removes uncertainty and
  potential for bugs.
- It's not possible to accidentally use fields of a slab implementation that is
  not configured.
- Other subsystems can no longer use slab's fields in struct page (some existing
  non-slab usages had to be adjusted in this series), so slab implementations
  have more freedom to rearrange them within struct slab.
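
As referenced above, here is a heavily trimmed, SLUB-flavoured sketch of how
struct slab reuses the storage of struct page. The real definition added in
mm/slab.h has per-allocator variants and additional fields, so treat the field
list as illustrative only.

/* Trimmed sketch of the struct slab overlay (SLUB-flavoured). */
struct slab {
        unsigned long __page_flags;     /* overlays page->flags */
        union {
                struct list_head slab_list;
                struct rcu_head rcu_head;
        };
        struct kmem_cache *slab_cache;
        /* Double-word boundary */
        void *freelist;                 /* first free object */
        union {
                unsigned long counters;
                struct {
                        unsigned inuse:16;
                        unsigned objects:15;
                        unsigned frozen:1;
                };
        };
        unsigned int __unused;
        atomic_t __page_refcount;       /* overlays page->_refcount */
#ifdef CONFIG_MEMCG
        unsigned long memcg_data;
#endif
};

/*
 * The overlay is enforced at compile time with asserts along these lines,
 * so the two layouts cannot silently drift apart:
 */
#define SLAB_MATCH(pg, sl) \
        static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
SLAB_MATCH(flags, __page_flags);
SLAB_MATCH(_refcount, __page_refcount);
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));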

Link: https://lore.kernel.org/all/20220104001046.12263-1-vbabka@suse.cz/
Vlastimil Babka committed Jan 7, 2022
2 parents eb52c0f + b01af5c commit 9d6c59c
Showing 27 changed files with 1,264 additions and 1,062 deletions.
2 changes: 1 addition & 1 deletion arch/x86/mm/init_64.c
@@ -981,7 +981,7 @@ static void __meminit free_pagetable(struct page *page, int order)
         if (PageReserved(page)) {
                 __ClearPageReserved(page);
 
-                magic = (unsigned long)page->freelist;
+                magic = page->index;
                 if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
                         while (nr_pages--)
                                 put_page_bootmem(page++);
2 changes: 1 addition & 1 deletion include/linux/bootmem_info.h
@@ -30,7 +30,7 @@ void put_page_bootmem(struct page *page);
  */
 static inline void free_bootmem_page(struct page *page)
 {
-        unsigned long magic = (unsigned long)page->freelist;
+        unsigned long magic = page->index;
 
         /*
          * The reserve_bootmem_region sets the reserved flag on bootmem
9 changes: 5 additions & 4 deletions include/linux/kasan.h
@@ -9,6 +9,7 @@
 
 struct kmem_cache;
 struct page;
+struct slab;
 struct vm_struct;
 struct task_struct;
 
@@ -193,11 +194,11 @@ static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache)
         return 0;
 }
 
-void __kasan_poison_slab(struct page *page);
-static __always_inline void kasan_poison_slab(struct page *page)
+void __kasan_poison_slab(struct slab *slab);
+static __always_inline void kasan_poison_slab(struct slab *slab)
 {
         if (kasan_enabled())
-                __kasan_poison_slab(page);
+                __kasan_poison_slab(slab);
 }
 
 void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
@@ -322,7 +323,7 @@ static inline void kasan_cache_create(struct kmem_cache *cache,
                                       slab_flags_t *flags) {}
 static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
 static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
-static inline void kasan_poison_slab(struct page *page) {}
+static inline void kasan_poison_slab(struct slab *slab) {}
 static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
                                               void *object) {}
 static inline void kasan_poison_object_data(struct kmem_cache *cache,
48 changes: 0 additions & 48 deletions include/linux/memcontrol.h
@@ -536,61 +536,13 @@ static inline bool folio_memcg_kmem(struct folio *folio)
         return folio->memcg_data & MEMCG_DATA_KMEM;
 }
 
-/*
- * page_objcgs - get the object cgroups vector associated with a page
- * @page: a pointer to the page struct
- *
- * Returns a pointer to the object cgroups vector associated with the page,
- * or NULL. This function assumes that the page is known to have an
- * associated object cgroups vector. It's not safe to call this function
- * against pages, which might have an associated memory cgroup: e.g.
- * kernel stack pages.
- */
-static inline struct obj_cgroup **page_objcgs(struct page *page)
-{
-        unsigned long memcg_data = READ_ONCE(page->memcg_data);
-
-        VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page);
-        VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
-
-        return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
-}
-
-/*
- * page_objcgs_check - get the object cgroups vector associated with a page
- * @page: a pointer to the page struct
- *
- * Returns a pointer to the object cgroups vector associated with the page,
- * or NULL. This function is safe to use if the page can be directly associated
- * with a memory cgroup.
- */
-static inline struct obj_cgroup **page_objcgs_check(struct page *page)
-{
-        unsigned long memcg_data = READ_ONCE(page->memcg_data);
-
-        if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS))
-                return NULL;
-
-        VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
-
-        return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
-}
-
 #else
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
         return false;
 }
 
-static inline struct obj_cgroup **page_objcgs(struct page *page)
-{
-        return NULL;
-}
-
-static inline struct obj_cgroup **page_objcgs_check(struct page *page)
-{
-        return NULL;
-}
 #endif
 
 static inline bool PageMemcgKmem(struct page *page)
12 changes: 12 additions & 0 deletions include/linux/mm.h
@@ -863,6 +863,13 @@ static inline struct page *virt_to_head_page(const void *x)
         return compound_head(page);
 }
 
+static inline struct folio *virt_to_folio(const void *x)
+{
+        struct page *page = virt_to_page(x);
+
+        return page_folio(page);
+}
+
 void __put_page(struct page *page);
 
 void put_pages_list(struct list_head *pages);
@@ -1753,6 +1760,11 @@ void page_address_init(void);
 #define page_address_init() do { } while(0)
 #endif
 
+static inline void *folio_address(const struct folio *folio)
+{
+        return page_address(&folio->page);
+}
+
 extern void *page_rmapping(struct page *page);
 extern struct anon_vma *page_anon_vma(struct page *page);
 extern pgoff_t __page_file_index(struct page *page);
10 changes: 5 additions & 5 deletions include/linux/mm_types.h
@@ -56,11 +56,11 @@ struct mem_cgroup;
  * in each subpage, but you may need to restore some of their values
  * afterwards.
  *
- * SLUB uses cmpxchg_double() to atomically update its freelist and
- * counters. That requires that freelist & counters be adjacent and
- * double-word aligned. We align all struct pages to double-word
- * boundaries, and ensure that 'freelist' is aligned within the
- * struct.
+ * SLUB uses cmpxchg_double() to atomically update its freelist and counters.
+ * That requires that freelist & counters in struct slab be adjacent and
+ * double-word aligned. Because struct slab currently just reinterprets the
+ * bits of struct page, we align all struct pages to double-word boundaries,
+ * and ensure that 'freelist' is aligned within struct slab.
  */
 #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
 #define _struct_page_alignment __aligned(2 * sizeof(unsigned long))
8 changes: 0 additions & 8 deletions include/linux/slab.h
@@ -189,14 +189,6 @@ bool kmem_valid_obj(void *object);
 void kmem_dump_obj(void *object);
 #endif
 
-#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
-void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
-                         bool to_user);
-#else
-static inline void __check_heap_object(const void *ptr, unsigned long n,
-                                       struct page *page, bool to_user) { }
-#endif
-
 /*
  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
  * alignment larger than the alignment of a 64-bit integer.
16 changes: 8 additions & 8 deletions include/linux/slab_def.h
@@ -87,11 +87,11 @@ struct kmem_cache {
         struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
-static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
+static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab,
                                 void *x)
 {
-        void *object = x - (x - page->s_mem) % cache->size;
-        void *last_object = page->s_mem + (cache->num - 1) * cache->size;
+        void *object = x - (x - slab->s_mem) % cache->size;
+        void *last_object = slab->s_mem + (cache->num - 1) * cache->size;
 
         if (unlikely(object > last_object))
                 return last_object;
@@ -106,16 +106,16 @@ static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
  * reciprocal_divide(offset, cache->reciprocal_buffer_size)
  */
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-                                        const struct page *page, void *obj)
+                                        const struct slab *slab, void *obj)
 {
-        u32 offset = (obj - page->s_mem);
+        u32 offset = (obj - slab->s_mem);
         return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 }
 
-static inline int objs_per_slab_page(const struct kmem_cache *cache,
-                                     const struct page *page)
+static inline int objs_per_slab(const struct kmem_cache *cache,
+                                const struct slab *slab)
 {
-        if (is_kfence_address(page_address(page)))
+        if (is_kfence_address(slab_address(slab)))
                 return 1;
         return cache->num;
 }
29 changes: 13 additions & 16 deletions include/linux/slub_def.h
@@ -48,9 +48,9 @@ enum stat_item {
 struct kmem_cache_cpu {
         void **freelist;        /* Pointer to next available object */
         unsigned long tid;      /* Globally unique transaction id */
-        struct page *page;      /* The slab from which we are allocating */
+        struct slab *slab;      /* The slab from which we are allocating */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
-        struct page *partial;   /* Partially allocated frozen slabs */
+        struct slab *partial;   /* Partially allocated frozen slabs */
 #endif
         local_lock_t lock;      /* Protects the fields above */
 #ifdef CONFIG_SLUB_STATS
@@ -99,8 +99,8 @@ struct kmem_cache {
 #ifdef CONFIG_SLUB_CPU_PARTIAL
         /* Number of per cpu partial objects to keep around */
         unsigned int cpu_partial;
-        /* Number of per cpu partial pages to keep around */
-        unsigned int cpu_partial_pages;
+        /* Number of per cpu partial slabs to keep around */
+        unsigned int cpu_partial_slabs;
 #endif
         struct kmem_cache_order_objects oo;
 
@@ -156,16 +156,13 @@ static inline void sysfs_slab_release(struct kmem_cache *s)
 }
 #endif
 
-void object_err(struct kmem_cache *s, struct page *page,
-                u8 *object, char *reason);
-
 void *fixup_red_left(struct kmem_cache *s, void *p);
 
-static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
+static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab,
                                 void *x) {
-        void *object = x - (x - page_address(page)) % cache->size;
-        void *last_object = page_address(page) +
-                (page->objects - 1) * cache->size;
+        void *object = x - (x - slab_address(slab)) % cache->size;
+        void *last_object = slab_address(slab) +
+                (slab->objects - 1) * cache->size;
         void *result = (unlikely(object > last_object)) ? last_object : object;
 
         result = fixup_red_left(cache, result);
@@ -181,16 +178,16 @@ static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
 }
 
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-                                        const struct page *page, void *obj)
+                                        const struct slab *slab, void *obj)
 {
         if (is_kfence_address(obj))
                 return 0;
-        return __obj_to_index(cache, page_address(page), obj);
+        return __obj_to_index(cache, slab_address(slab), obj);
 }
 
-static inline int objs_per_slab_page(const struct kmem_cache *cache,
-                                     const struct page *page)
+static inline int objs_per_slab(const struct kmem_cache *cache,
+                                const struct slab *slab)
 {
-        return page->objects;
+        return slab->objects;
 }
 #endif /* _LINUX_SLUB_DEF_H */
7 changes: 3 additions & 4 deletions mm/bootmem_info.c
@@ -15,22 +15,21 @@
 
 void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
 {
-        page->freelist = (void *)type;
+        page->index = type;
         SetPagePrivate(page);
         set_page_private(page, info);
         page_ref_inc(page);
 }
 
 void put_page_bootmem(struct page *page)
 {
-        unsigned long type;
+        unsigned long type = page->index;
 
-        type = (unsigned long) page->freelist;
         BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
                type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
 
         if (page_ref_dec_return(page) == 1) {
-                page->freelist = NULL;
+                page->index = 0;
                 ClearPagePrivate(page);
                 set_page_private(page, 0);
                 INIT_LIST_HEAD(&page->lru);
27 changes: 15 additions & 12 deletions mm/kasan/common.c
@@ -247,8 +247,9 @@ struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
 }
 #endif
 
-void __kasan_poison_slab(struct page *page)
+void __kasan_poison_slab(struct slab *slab)
 {
+        struct page *page = slab_page(slab);
         unsigned long i;
 
         for (i = 0; i < compound_nr(page); i++)
@@ -298,7 +299,7 @@ static inline u8 assign_tag(struct kmem_cache *cache,
         /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
 #ifdef CONFIG_SLAB
         /* For SLAB assign tags based on the object index in the freelist. */
-        return (u8)obj_to_index(cache, virt_to_head_page(object), (void *)object);
+        return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object);
 #else
         /*
          * For SLUB assign a random tag during slab creation, otherwise reuse
@@ -341,7 +342,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
         if (is_kfence_address(object))
                 return false;
 
-        if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
+        if (unlikely(nearest_obj(cache, virt_to_slab(object), object) !=
             object)) {
                 kasan_report_invalid_free(tagged_object, ip);
                 return true;
@@ -401,22 +402,24 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
 
 void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
 {
-        struct page *page;
+        struct folio *folio;
 
-        page = virt_to_head_page(ptr);
+        folio = virt_to_folio(ptr);
 
         /*
          * Even though this function is only called for kmem_cache_alloc and
          * kmalloc backed mempool allocations, those allocations can still be
         * !PageSlab() when the size provided to kmalloc is larger than
          * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
          */
-        if (unlikely(!PageSlab(page))) {
+        if (unlikely(!folio_test_slab(folio))) {
                 if (____kasan_kfree_large(ptr, ip))
                         return;
-                kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false);
+                kasan_poison(ptr, folio_size(folio), KASAN_FREE_PAGE, false);
         } else {
-                ____kasan_slab_free(page->slab_cache, ptr, ip, false, false);
+                struct slab *slab = folio_slab(folio);
+
+                ____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
         }
 }
 
@@ -560,7 +563,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
 
 void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags)
 {
-        struct page *page;
+        struct slab *slab;
 
         if (unlikely(object == ZERO_SIZE_PTR))
                 return (void *)object;
@@ -572,13 +575,13 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags)
          */
         kasan_unpoison(object, size, false);
 
-        page = virt_to_head_page(object);
+        slab = virt_to_slab(object);
 
         /* Piggy-back on kmalloc() instrumentation to poison the redzone. */
-        if (unlikely(!PageSlab(page)))
+        if (unlikely(!slab))
                 return __kasan_kmalloc_large(object, size, flags);
         else
-                return ____kasan_kmalloc(page->slab_cache, object, size, flags);
+                return ____kasan_kmalloc(slab->slab_cache, object, size, flags);
 }
 
 bool __kasan_check_byte(const void *address, unsigned long ip)