diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 64152dc6f2933..7d639a996f287 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -849,6 +849,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data, int nr_folios)
+{
+	struct page **page_array = *pages, **new_array = NULL;
+	int nr_pages_left = *nr_pages, i, j;
+
+	/* Store head pages only */
+	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+					GFP_KERNEL);
+	if (!new_array)
+		return false;
+
+	new_array[0] = compound_head(page_array[0]);
+	/*
+	 * The pages are bound to the folio, it doesn't
+	 * actually unpin them but drops all but one reference,
+	 * which is usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	if (data->nr_pages_head > 1)
+		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+	j = data->nr_pages_head;
+	nr_pages_left -= data->nr_pages_head;
+	for (i = 1; i < nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		new_array[i] = page_array[j];
+		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+				data->nr_pages_mid - 1);
+		if (nr_unpin)
+			unpin_user_pages(&page_array[j+1], nr_unpin);
+		j += data->nr_pages_mid;
+		nr_pages_left -= data->nr_pages_mid;
+	}
+	kvfree(page_array);
+	*pages = new_array;
+	*nr_pages = nr_folios;
+	return true;
+}
+
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+					struct io_imu_folio_data *data)
+{
+	struct page **page_array = *pages;
+	struct folio *folio = page_folio(page_array[0]);
+	unsigned int count = 1, nr_folios = 1;
+	int i;
+
+	if (*nr_pages <= 1)
+		return false;
+
+	data->nr_pages_mid = folio_nr_pages(folio);
+	if (data->nr_pages_mid == 1)
+		return false;
+
+	data->folio_shift = folio_shift(folio);
+	/*
+	 * Check if pages are contiguous inside a folio, and all folios have
+	 * the same page count except for the head and tail.
+	 */
+	for (i = 1; i < *nr_pages; i++) {
+		if (page_folio(page_array[i]) == folio &&
+		    page_array[i] == page_array[i-1] + 1) {
+			count++;
+			continue;
+		}
+
+		if (nr_folios == 1) {
+			if (folio_page_idx(folio, page_array[i-1]) !=
+			    data->nr_pages_mid - 1)
+				return false;
+
+			data->nr_pages_head = count;
+		} else if (count != data->nr_pages_mid) {
+			return false;
+		}
+
+		folio = page_folio(page_array[i]);
+		if (folio_size(folio) != (1UL << data->folio_shift) ||
+		    folio_page_idx(folio, page_array[i]) != 0)
+			return false;
+
+		count = 1;
+		nr_folios++;
+	}
+	if (nr_folios == 1)
+		data->nr_pages_head = count;
+
+	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
@@ -858,7 +950,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	struct io_imu_folio_data data;
+	bool coalesced;
 
 	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
 	if (!iov->iov_base)
@@ -873,31 +966,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
+	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
+	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
 
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
@@ -909,7 +979,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
@@ -917,17 +986,18 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
 	imu->folio_mask = PAGE_MASK;
+	if (coalesced) {
+		imu->folio_shift = data.folio_shift;
+		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+	}
+	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
 	*pimu = imu;
 	ret = 0;
 
-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
-		goto done;
-	}
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;
 
-		vec_len = min_t(size_t, size, PAGE_SIZE - off);
+		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
 		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
 		off = 0;
 		size -= vec_len;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index ee77e53328bfa..18242b2e9da49 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -52,6 +52,14 @@ struct io_mapped_ubuf {
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
+struct io_imu_folio_data {
+	/* Head folio can be partially included in the fixed buf */
+	unsigned int	nr_pages_head;
+	/* For non-head/tail folios, has to be fully included */
+	unsigned int	nr_pages_mid;
+	unsigned int	folio_shift;
+};
+
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
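
Below is a minimal standalone C sketch, not part of the patch, of the bvec sizing arithmetic the final rsrc.c hunk switches to: once an imu is coalesced, `off` becomes iov_base's offset within the first folio rather than within the first page, the first bvec covers the remainder of that folio, and each later bvec spans a whole folio, with the last one possibly cut short by the remaining length. The 2 MiB folio size and the buffer geometry are assumed example values.

/*
 * Illustrative sketch only: mirrors the bvec sizing loop in
 * io_sqe_buffer_register() after coalescing. Assumes 2 MiB folios
 * and a 3 MiB buffer starting 4 KiB into its first folio.
 */
#include <stdio.h>
#include <stddef.h>

int main(void)
{
	unsigned int folio_shift = 21;			/* imu->folio_shift: 2 MiB folios */
	unsigned long folio_mask = ~((1UL << folio_shift) - 1);
	unsigned long ubuf = (2UL << 20) + 4096;	/* iov_base, 4 KiB into a folio */
	size_t size = 3UL << 20;			/* iov_len: 3 MiB */
	unsigned long off = ubuf & ~folio_mask;		/* offset inside the head folio */
	int i;

	for (i = 0; size != 0; i++) {
		/* vec_len = min(size, folio_size - off), as in the patched loop */
		size_t vec_len = (1UL << folio_shift) - off;

		if (vec_len > size)
			vec_len = size;
		printf("bvec[%d]: off=%lu len=%zu\n", i, off, vec_len);
		off = 0;		/* only the head folio has a nonzero offset */
		size -= vec_len;
	}
	return 0;
}

This prints two entries (off=4096 len=2093056, then off=0 len=1052672). With 4 KiB pages and no coalescing, the same 3 MiB buffer would need 768 single-page bvec entries; the coalesced layout needs two.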