-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
powerpc/pseries: add RTAS work area allocator
Various pseries-specific RTAS functions take a temporary "work area" parameter - a buffer in memory accessible to RTAS. Typically such functions are passed the statically allocated rtas_data_buf buffer as the argument. This buffer is protected by a global spinlock. So users of rtas_data_buf cannot perform sleeping operations while accessing the buffer. Most RTAS functions that have a work area parameter can return a status (-2/990x) that indicates that the caller should retry. Before retrying, the caller may need to reschedule or sleep (see rtas_busy_delay() for details). This combination of factors leads to uncomfortable constructions like this: do { spin_lock(&rtas_data_buf_lock); rc = rtas_call(token, __pa(rtas_data_buf), ...); if (rc == 0) { /* parse or copy out rtas_data_buf contents */ } spin_unlock(&rtas_data_buf_lock); } while (rtas_busy_delay(rc)); Another unfortunately common way of handling this is for callers to blithely ignore the possibility of a -2/990x status and hope for the best. If users were allowed to perform blocking operations while owning a work area, the programming model would become less tedious and error-prone. Users could schedule away, sleep, or perform other blocking operations without having to release and re-acquire resources. We could continue to use a single work area buffer, and convert rtas_data_buf_lock to a mutex. But that would impose an unnecessarily coarse serialization on all users. As awkward as the current design is, it prevents longer running operations that need to repeatedly use rtas_data_buf from blocking the progress of others. There are more considerations. One is that while 4KB is fine for all current in-kernel uses, some RTAS calls can take much smaller buffers, and some (VPD, platform dumps) would likely benefit from larger ones. Another is that at least one RTAS function (ibm,get-vpd) has *two* work area parameters. 
And finally, we should expect the number of work area users in the kernel to increase over time as we introduce lockdown-compatible ABIs to replace less safe use cases based on sys_rtas/librtas. So a special-purpose allocator for RTAS work area buffers seems worth trying. Properties: * The backing memory for the allocator is reserved early in boot in order to satisfy RTAS addressing requirements, and then managed with genalloc. * Allocations can block, but they never fail (mempool-like). * Prioritizes first-come, first-served fairness over throughput. * Early boot allocations before the allocator has been initialized are served via an internal static buffer. Intended to replace rtas_data_buf. New code that needs RTAS work area buffers should prefer this API. Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20230125-b4-powerpc-rtas-queue-v3-12-26929c8cce78@linux.ibm.com
- Loading branch information
Nathan Lynch
authored and
Michael Ellerman
committed
Feb 13, 2023
1 parent
24098f5
commit 43033bc
Showing
4 changed files
with
309 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
/* SPDX-License-Identifier: GPL-2.0-only */ | ||
#ifndef _ASM_POWERPC_RTAS_WORK_AREA_H | ||
#define _ASM_POWERPC_RTAS_WORK_AREA_H | ||
|
||
#include <linux/build_bug.h> | ||
#include <linux/sizes.h> | ||
#include <linux/types.h> | ||
|
||
#include <asm/page.h> | ||
|
||
/** | ||
* struct rtas_work_area - RTAS work area descriptor. | ||
* | ||
* Descriptor for a "work area" in PAPR terminology that satisfies | ||
* RTAS addressing requirements. | ||
* | ||
* Descriptors are obtained from rtas_work_area_alloc() and released | ||
* with rtas_work_area_free(). Read the fields through | ||
* rtas_work_area_raw_buf(), rtas_work_area_size() and | ||
* rtas_work_area_phys() rather than accessing them directly. | ||
*/ | ||
struct rtas_work_area { | ||
/* private: Use the APIs provided below. */ | ||
char *buf; | ||
size_t size; | ||
}; | ||
|
||
enum { | ||
/* Maximum allocation size, enforced at build time by the static_assert() in rtas_work_area_alloc(). */ | ||
RTAS_WORK_AREA_MAX_ALLOC_SZ = SZ_128K, | ||
}; | ||
|
||
/** | ||
* rtas_work_area_alloc() - Acquire a work area of the requested size. | ||
* @size_: Allocation size. Must be a compile-time constant, greater | ||
* than zero, and not more than %RTAS_WORK_AREA_MAX_ALLOC_SZ. | ||
* Each of these conditions is checked with static_assert(), | ||
* so a violation fails the build. | ||
* | ||
* Allocate a buffer suitable for passing to RTAS functions that have | ||
* a memory address parameter, often (but not always) referred to as a | ||
* "work area" in PAPR. Although callers are allowed to block while | ||
* holding a work area, the amount of memory reserved for this purpose | ||
* is limited, and allocations should be short-lived. A good guideline | ||
* is to release any allocated work area before returning from a | ||
* system call. | ||
* | ||
* This function does not fail. It blocks until the allocation | ||
* succeeds. To prevent deadlocks, callers are discouraged from | ||
* allocating more than one work area simultaneously in a single task | ||
* context. | ||
* | ||
* Context: This function may sleep. | ||
* Return: A &struct rtas_work_area descriptor for the allocated work area. | ||
*/ | ||
#define rtas_work_area_alloc(size_) ({ \ | ||
static_assert(__builtin_constant_p(size_)); \ | ||
static_assert((size_) > 0); \ | ||
static_assert((size_) <= RTAS_WORK_AREA_MAX_ALLOC_SZ); \ | ||
__rtas_work_area_alloc(size_); \ | ||
}) | ||
|
||
/* | ||
* Do not call __rtas_work_area_alloc() directly. Use | ||
* rtas_work_area_alloc(). | ||
*/ | ||
struct rtas_work_area *__rtas_work_area_alloc(size_t size); | ||
|
||
/** | ||
* rtas_work_area_free() - Release a work area. | ||
* @area: Work area descriptor as returned from rtas_work_area_alloc(). | ||
* | ||
* Return a work area buffer to the pool. | ||
*/ | ||
void rtas_work_area_free(struct rtas_work_area *area); | ||
|
||
/* Return the address of @area's buffer as a pointer (the descriptor's buf field). */
static inline char *rtas_work_area_raw_buf(const struct rtas_work_area *area) | ||
{ | ||
return area->buf; | ||
} | ||
|
||
/* Return the buffer size recorded in @area's descriptor. */
static inline size_t rtas_work_area_size(const struct rtas_work_area *area) | ||
{ | ||
return area->size; | ||
} | ||
|
||
/* Return the physical address of @area's buffer, obtained by applying __pa() to it. */
static inline phys_addr_t rtas_work_area_phys(const struct rtas_work_area *area) | ||
{ | ||
return __pa(area->buf); | ||
} | ||
|
||
/* | ||
* Early setup for the work area allocator. Call from | ||
* rtas_initialize() only. | ||
*/ | ||
|
||
#ifdef CONFIG_PPC_PSERIES | ||
void rtas_work_area_reserve_arena(phys_addr_t limit); | ||
#else /* CONFIG_PPC_PSERIES */ | ||
static inline void rtas_work_area_reserve_arena(phys_addr_t limit) {} | ||
#endif /* CONFIG_PPC_PSERIES */ | ||
|
||
#endif /* _ASM_POWERPC_RTAS_WORK_AREA_H */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
// SPDX-License-Identifier: GPL-2.0-only | ||
|
||
#define pr_fmt(fmt) "rtas-work-area: " fmt | ||
|
||
#include <linux/genalloc.h> | ||
#include <linux/log2.h> | ||
#include <linux/kernel.h> | ||
#include <linux/memblock.h> | ||
#include <linux/mempool.h> | ||
#include <linux/minmax.h> | ||
#include <linux/mutex.h> | ||
#include <linux/numa.h> | ||
#include <linux/sizes.h> | ||
#include <linux/wait.h> | ||
|
||
#include <asm/machdep.h> | ||
#include <asm/rtas-work-area.h> | ||
#include <asm/rtas.h> | ||
|
||
enum { | ||
/* | ||
* Ensure the pool is page-aligned. Used as the alignment of the | ||
* memblock allocation in rtas_work_area_reserve_arena(). | ||
*/ | ||
RTAS_WORK_AREA_ARENA_ALIGN = PAGE_SIZE, | ||
/* | ||
* Don't let a single allocation claim the whole arena: the arena | ||
* is twice the maximum allocation size. | ||
*/ | ||
RTAS_WORK_AREA_ARENA_SZ = RTAS_WORK_AREA_MAX_ALLOC_SZ * 2, | ||
/* | ||
* The smallest known work area size is for ibm,get-vpd's | ||
* location code argument, which is limited to 79 characters | ||
* plus 1 nul terminator. | ||
* | ||
* PAPR+ 7.3.20 ibm,get-vpd RTAS Call | ||
* PAPR+ 12.3.2.4 Converged Location Code Rules - Length Restrictions | ||
* | ||
* The value is rounded up to a power of two; its log2 is the | ||
* minimum allocation order passed to gen_pool_create() in | ||
* rtas_work_area_allocator_init(). | ||
*/ | ||
RTAS_WORK_AREA_MIN_ALLOC_SZ = roundup_pow_of_two(80), | ||
}; | ||
|
||
/*
 * Allocator state.
 *
 * gen_pool:        pool managing the arena; assigned in
 *                  rtas_work_area_allocator_init().
 * arena:           memory reserved by rtas_work_area_reserve_arena().
 * mutex:           held while an allocation waits for space in gen_pool.
 * wqh:             wait queue that allocations sleep on; woken by
 *                  rtas_work_area_free().
 * descriptor_pool: mempool supplying struct rtas_work_area descriptors.
 * available:       set once rtas_work_area_allocator_init() succeeds;
 *                  until then alloc/free use the early-boot buffer.
 */
static struct { | ||
struct gen_pool *gen_pool; | ||
char *arena; | ||
struct mutex mutex; /* serializes allocations */ | ||
struct wait_queue_head wqh; | ||
mempool_t descriptor_pool; | ||
bool available; | ||
} rwa_state = { | ||
.mutex = __MUTEX_INITIALIZER(rwa_state.mutex), | ||
.wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state.wqh), | ||
}; | ||
|
||
/* | ||
* A single work area buffer and descriptor to serve requests early in | ||
* boot before the allocator is fully initialized. We know 4KB is the | ||
* most any boot time user needs (they all call ibm,get-system-parameter). | ||
* | ||
* early_work_area_in_use records whether that one buffer is currently | ||
* handed out; it is set by rtas_work_area_alloc_early() and cleared by | ||
* rtas_work_area_free_early(). | ||
*/ | ||
static bool early_work_area_in_use __initdata; | ||
static char early_work_area_buf[SZ_4K] __initdata __aligned(SZ_4K); | ||
static struct rtas_work_area early_work_area __initdata = { | ||
.buf = early_work_area_buf, | ||
.size = sizeof(early_work_area_buf), | ||
}; | ||
|
||
|
||
/*
 * Hand out the single static early-boot work area.
 *
 * Warns, but still returns the buffer, if @size exceeds the early
 * buffer's size or if the buffer is already marked in use. The whole
 * buffer is zeroed before it is returned.
 */
static struct rtas_work_area * __init rtas_work_area_alloc_early(size_t size) | ||
{ | ||
WARN_ON(size > early_work_area.size); | ||
WARN_ON(early_work_area_in_use); | ||
early_work_area_in_use = true; | ||
memset(early_work_area.buf, 0, early_work_area.size); | ||
return &early_work_area; | ||
} | ||
|
||
/*
 * Mark the early-boot work area as no longer in use.
 *
 * Warns if @work_area is not the early descriptor or if the early
 * buffer was not marked in use; the flag is cleared either way.
 */
static void __init rtas_work_area_free_early(struct rtas_work_area *work_area) | ||
{ | ||
WARN_ON(work_area != &early_work_area); | ||
WARN_ON(!early_work_area_in_use); | ||
early_work_area_in_use = false; | ||
} | ||
|
||
/*
 * Back end for the rtas_work_area_alloc() macro.
 *
 * Until rwa_state.available is set, requests are served from the
 * static early-boot buffer. Afterwards the buffer is taken from the
 * gen_pool, sleeping on the wait queue until gen_pool_alloc() succeeds,
 * and the descriptor is taken from the descriptor mempool once the
 * mutex has been dropped.
 */
struct rtas_work_area * __ref __rtas_work_area_alloc(size_t size) | ||
{ | ||
struct rtas_work_area *area; | ||
unsigned long addr; | ||
|
||
might_sleep(); | ||
|
||
/* | ||
* The rtas_work_area_alloc() wrapper enforces this at build | ||
* time. Requests that exceed the arena size will block | ||
* indefinitely. | ||
*/ | ||
WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ); | ||
|
||
if (!rwa_state.available) | ||
return rtas_work_area_alloc_early(size); | ||
/* | ||
* To ensure FCFS behavior and prevent a high rate of smaller | ||
* requests from starving larger ones, use the mutex to queue | ||
* allocations. | ||
*/ | ||
mutex_lock(&rwa_state.mutex); | ||
wait_event(rwa_state.wqh, | ||
(addr = gen_pool_alloc(rwa_state.gen_pool, size)) != 0); | ||
mutex_unlock(&rwa_state.mutex); | ||
|
||
area = mempool_alloc(&rwa_state.descriptor_pool, GFP_KERNEL); | ||
area->buf = (char *)addr; | ||
area->size = size; | ||
|
||
return area; | ||
} | ||
|
||
/*
 * Release @area.
 *
 * Until rwa_state.available is set, this only releases the early-boot
 * buffer. Otherwise the buffer goes back to the gen_pool, the
 * descriptor goes back to the mempool, and tasks sleeping in
 * __rtas_work_area_alloc() are woken.
 */
void __ref rtas_work_area_free(struct rtas_work_area *area) | ||
{ | ||
if (!rwa_state.available) { | ||
rtas_work_area_free_early(area); | ||
return; | ||
} | ||
|
||
gen_pool_free(rwa_state.gen_pool, (unsigned long)area->buf, area->size); | ||
mempool_free(area, &rwa_state.descriptor_pool); | ||
wake_up(&rwa_state.wqh); | ||
} | ||
|
||
/* | ||
* Initialization of the work area allocator happens in two parts. To | ||
* reliably reserve an arena that satisfies RTAS addressing | ||
* requirements, we must perform a memblock allocation early, | ||
* immediately after RTAS instantiation. Then we have to wait until | ||
* the slab allocator is up before setting up the descriptor mempool | ||
* and adding the arena to a gen_pool. | ||
* | ||
* This function is the second part. On success it publishes the | ||
* gen_pool and sets rwa_state.available, which switches allocations | ||
* away from the early-boot buffer. | ||
* | ||
* Return: 0 on success; -ENOMEM if no arena was reserved or the | ||
* gen_pool could not be created; otherwise the error returned by | ||
* gen_pool_add() or mempool_init_kmalloc_pool(). | ||
*/ | ||
static __init int rtas_work_area_allocator_init(void) | ||
{ | ||
const unsigned int order = ilog2(RTAS_WORK_AREA_MIN_ALLOC_SZ); | ||
const phys_addr_t pa_start = __pa(rwa_state.arena); | ||
const phys_addr_t pa_end = pa_start + RTAS_WORK_AREA_ARENA_SZ - 1; | ||
struct gen_pool *pool; | ||
const int nid = NUMA_NO_NODE; | ||
int err; | ||
|
||
err = -ENOMEM; | ||
if (!rwa_state.arena) | ||
goto err_out; | ||
|
||
pool = gen_pool_create(order, nid); | ||
if (!pool) | ||
goto err_out; | ||
/* | ||
* All RTAS functions that consume work areas are OK with | ||
* natural alignment, when they have alignment requirements at | ||
* all. | ||
*/ | ||
gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL); | ||
|
||
err = gen_pool_add(pool, (unsigned long)rwa_state.arena, | ||
RTAS_WORK_AREA_ARENA_SZ, nid); | ||
if (err) | ||
goto err_destroy; | ||
|
||
err = mempool_init_kmalloc_pool(&rwa_state.descriptor_pool, 1, | ||
sizeof(struct rtas_work_area)); | ||
if (err) | ||
goto err_destroy; | ||
|
||
rwa_state.gen_pool = pool; | ||
rwa_state.available = true; | ||
|
||
pr_debug("arena [%pa-%pa] (%uK), min/max alloc sizes %u/%u\n", | ||
&pa_start, &pa_end, | ||
RTAS_WORK_AREA_ARENA_SZ / SZ_1K, | ||
RTAS_WORK_AREA_MIN_ALLOC_SZ, | ||
RTAS_WORK_AREA_MAX_ALLOC_SZ); | ||
|
||
return 0; | ||
|
||
err_destroy: | ||
gen_pool_destroy(pool); | ||
err_out: | ||
return err; | ||
} | ||
machine_arch_initcall(pseries, rtas_work_area_allocator_init); | ||
|
||
/** | ||
* rtas_work_area_reserve_arena() - Reserve memory suitable for RTAS work areas. | ||
* @limit: Upper address bound for the reservation; passed to | ||
* memblock_alloc_try_nid() as its maximum-address argument. | ||
* | ||
* The result is stored in rwa_state.arena. Nothing is reserved when | ||
* neither of the RTAS services checked below is present, in which | ||
* case rtas_work_area_allocator_init() later fails with -ENOMEM. | ||
*/ | ||
void __init rtas_work_area_reserve_arena(const phys_addr_t limit) | ||
{ | ||
const phys_addr_t align = RTAS_WORK_AREA_ARENA_ALIGN; | ||
const phys_addr_t size = RTAS_WORK_AREA_ARENA_SZ; | ||
const phys_addr_t min = MEMBLOCK_LOW_LIMIT; | ||
const int nid = NUMA_NO_NODE; | ||
|
||
/* | ||
* Too early for a machine_is(pseries) check. But PAPR | ||
* effectively mandates that ibm,get-system-parameter is | ||
* present: | ||
* | ||
* R1–7.3.16–1. All platforms must support the System | ||
* Parameters option. | ||
* | ||
* So set up the arena if we find that, with a fallback to | ||
* ibm,configure-connector, just in case. | ||
*/ | ||
if (rtas_service_present("ibm,get-system-parameter") || | ||
rtas_service_present("ibm,configure-connector")) | ||
rwa_state.arena = memblock_alloc_try_nid(size, align, min, limit, nid); | ||
} | ||