diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe91c293636..374bffc87427 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -284,12 +284,10 @@ enum swap_cluster_flags { #endif /* - * We assign a cluster to each CPU, so each CPU can allocate swap entry from - * its own cluster and swapout sequentially. The purpose is to optimize swapout - * throughput. + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. */ -struct percpu_cluster { - local_lock_t lock; /* Protect the percpu_cluster above */ +struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -315,8 +313,7 @@ struct swap_info_struct { atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ - struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ diff --git a/mm/swapfile.c b/mm/swapfile.c index db836670c334..8b296c4c636b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); +struct percpu_swap_cluster { + struct swap_info_struct *si[SWAP_NR_ORDERS]; + unsigned long offset[SWAP_NR_ORDERS]; + local_lock_t lock; +}; + +static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { + .si = { NULL }, + .offset = { SWAP_ENTRY_INVALID }, + .lock = INIT_LOCAL_LOCK(), +}; + static struct swap_info_struct *swap_type_to_swap_info(int type) { if (type >= MAX_SWAPFILES) @@ -539,7 +551,7 @@ static bool swap_do_scheduled_discard(struct swap_info_struct *si) ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); /* * Delete the cluster from list to prepare for discard, but keep - * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster + * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be * pointing to it, or ran into by relocate_cluster. */ list_del(&ci->list); @@ -805,10 +817,12 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, out: relocate_cluster(si, ci); unlock_cluster(ci); - if (si->flags & SWP_SOLIDSTATE) - __this_cpu_write(si->percpu_cluster->next[order], next); - else + if (si->flags & SWP_SOLIDSTATE) { + this_cpu_write(percpu_swap_cluster.offset[order], next); + this_cpu_write(percpu_swap_cluster.si[order], si); + } else { si->global_cluster->next[order] = next; + } return found; } @@ -862,20 +876,18 @@ static void swap_reclaim_work(struct work_struct *work) } /* - * Try to get swap entries with specified order from current cpu's swap entry - * pool (a cluster). This might involve allocating a new cluster for current CPU - * too. + * Try to allocate swap entries with specified order and try set a new + * cluster for current CPU too. */ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsigned char usage) { struct swap_cluster_info *ci; - unsigned int offset, found = 0; + unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; if (si->flags & SWP_SOLIDSTATE) { - /* Fast path using per CPU cluster */ - local_lock(&si->percpu_cluster->lock); - offset = __this_cpu_read(si->percpu_cluster->next[order]); + if (si == this_cpu_read(percpu_swap_cluster.si[order])) + offset = this_cpu_read(percpu_swap_cluster.offset[order]); } else { /* Serialize HDD SWAP allocation for each device. */ spin_lock(&si->global_cluster_lock); @@ -973,9 +985,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o } } done: - if (si->flags & SWP_SOLIDSTATE) - local_unlock(&si->percpu_cluster->lock); - else + if (!(si->flags & SWP_SOLIDSTATE)) spin_unlock(&si->global_cluster_lock); return found; } @@ -1196,6 +1206,51 @@ static bool get_swap_device_info(struct swap_info_struct *si) return true; } +/* + * Fast path try to get swap entries with specified order from current + * CPU's swap entry pool (a cluster). + */ +static int swap_alloc_fast(swp_entry_t entries[], + unsigned char usage, + int order, int n_goal) +{ + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned int offset, found; + int n_ret = 0; + + n_goal = min(n_goal, SWAP_BATCH); + + /* + * Once allocated, swap_info_struct will never be completely freed, + * so checking it's liveness by get_swap_device_info is enough. + */ + si = this_cpu_read(percpu_swap_cluster.si[order]); + offset = this_cpu_read(percpu_swap_cluster.offset[order]); + if (!si || !offset || !get_swap_device_info(si)) + return 0; + + while (offset) { + ci = lock_cluster(si, offset); + if (!cluster_is_usable(ci, order)) { + unlock_cluster(ci); + break; + } + if (cluster_is_empty(ci)) + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + if (!found) + break; + entries[n_ret++] = swp_entry(si->type, found); + if (n_ret == n_goal) + break; + offset = this_cpu_read(percpu_swap_cluster.offset[order]); + } + + put_swap_device(si); + return n_ret; +} + int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { int order = swap_entry_order(entry_order); @@ -1204,19 +1259,36 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) int n_ret = 0; int node; + /* Fast path using percpu cluster */ + local_lock(&percpu_swap_cluster.lock); + n_ret = swap_alloc_fast(swp_entries, + SWAP_HAS_CACHE, + order, n_goal); + if (n_ret == n_goal) + goto out; + + n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH); + /* Rotate the device and switch to a new cluster */ spin_lock(&swap_avail_lock); start_over: node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { - /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries, order); + /* + * For order 0 allocation, try best to fill the request + * as it's used by slot cache. + * + * For mTHP allocation, it always have n_goal == 1, + * and falling a mTHP swapin will just make the caller + * fallback to order 0 allocation, so just bail out. + */ + n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, + swp_entries + n_ret, order); put_swap_device(si); if (n_ret || size > 1) - goto check_out; + goto out; } spin_lock(&swap_avail_lock); @@ -1234,12 +1306,10 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) if (plist_node_empty(&next->avail_lists[node])) goto start_over; } - spin_unlock(&swap_avail_lock); - -check_out: +out: + local_unlock(&percpu_swap_cluster.lock); atomic_long_sub(n_ret * size, &nr_swap_pages); - return n_ret; } @@ -2597,6 +2667,28 @@ static void wait_for_allocation(struct swap_info_struct *si) } } +/* + * Called after swap device's reference count is dead, so + * neither scan nor allocation will use it. + */ +static void flush_percpu_swap_cluster(struct swap_info_struct *si) +{ + int cpu, i; + struct swap_info_struct **pcp_si; + + for_each_possible_cpu(cpu) { + pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu); + /* + * Invalidate the percpu swap cluster cache, si->users + * is dead, so no new user will point to it, just flush + * any existing user. + */ + for (i = 0; i < SWAP_NR_ORDERS; i++) + cmpxchg(&pcp_si[i], si, NULL); + } +} + + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -2698,6 +2790,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) flush_work(&p->discard_work); flush_work(&p->reclaim_work); + flush_percpu_swap_cluster(p); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) @@ -2725,8 +2818,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) arch_swap_invalidate_area(p->type); zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); - free_percpu(p->percpu_cluster); - p->percpu_cluster = NULL; kfree(p->global_cluster); p->global_cluster = NULL; vfree(swap_map); @@ -3125,7 +3216,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; unsigned long i, j, idx; - int cpu, err = -ENOMEM; + int err = -ENOMEM; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) @@ -3134,20 +3225,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); - if (si->flags & SWP_SOLIDSTATE) { - si->percpu_cluster = alloc_percpu(struct percpu_cluster); - if (!si->percpu_cluster) - goto err_free; - - for_each_possible_cpu(cpu) { - struct percpu_cluster *cluster; - - cluster = per_cpu_ptr(si->percpu_cluster, cpu); - for (i = 0; i < SWAP_NR_ORDERS; i++) - cluster->next[i] = SWAP_ENTRY_INVALID; - local_lock_init(&cluster->lock); - } - } else { + if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL); if (!si->global_cluster) @@ -3424,8 +3502,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) bad_swap_unlock_inode: inode_unlock(inode); bad_swap: - free_percpu(si->percpu_cluster); - si->percpu_cluster = NULL; kfree(si->global_cluster); si->global_cluster = NULL; inode = NULL;