diff --git a/include/linux/swap.h b/include/linux/swap.h
index 517d24e96d8c..62fc7499b408 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -224,7 +224,6 @@ enum {
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
 /* Bit flag in swap_map */
-#define SWAP_HAS_CACHE	0x40	/* Flag page is cached, in first swap_map */
 #define COUNT_CONTINUED	0x80	/* Flag swap_map continuation for full count */
 
 /* Special value in first swap_map */
diff --git a/mm/swap.h b/mm/swap.h
index 393378ce1687..bfafa637c458 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -205,6 +205,11 @@ int folio_alloc_swap(struct folio *folio);
 int folio_dup_swap(struct folio *folio, struct page *subpage);
 void folio_put_swap(struct folio *folio, struct page *subpage);
 
+/* For internal use */
+extern void swap_entries_free(struct swap_info_struct *si,
+			      struct swap_cluster_info *ci,
+			      unsigned long offset, unsigned int nr_pages);
+
 /* linux/mm/page_io.c */
 int sio_pool_init(void);
 struct swap_iocb;
@@ -256,14 +261,6 @@ static inline bool folio_matches_swap_entry(const struct folio *folio,
 	return folio_entry.val == round_down(entry.val, nr_pages);
 }
 
-/* Temporary internal helpers */
-void __swapcache_set_cached(struct swap_info_struct *si,
-			    struct swap_cluster_info *ci,
-			    swp_entry_t entry);
-void __swapcache_clear_cached(struct swap_info_struct *si,
-			      struct swap_cluster_info *ci,
-			      swp_entry_t entry, unsigned int nr);
-
 /*
  * All swap cache helpers below require the caller to ensure the swap entries
  * used are valid and stabilize the device by any of the following ways:
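The theme of the whole patch is visible in these two headers already: SWAP_HAS_CACHE disappears from the swap_map byte, and the swap table (plus the now-exported swap_entries_free()) takes over tracking cache state. A self-contained sketch of the before/after predicate; slot_cached_old()/slot_cached_new() are made-up names, and a nonzero table_entry merely stands in for the real swp_tb_is_folio() test:

#include <stdbool.h>

#define SWAP_HAS_CACHE	0x40	/* the bit this patch deletes */

/* Before: cache state is a flag folded into the swap_map count byte. */
static bool slot_cached_old(unsigned char map_byte)
{
	return map_byte & SWAP_HAS_CACHE;
}

/* After: cache state is "the swap table slot holds a folio pointer". */
static bool slot_cached_new(unsigned long table_entry)
{
	return table_entry != 0;	/* stands in for swp_tb_is_folio() */
}

With the flag gone, every swap_count() masking call in the hunks below collapses into a plain read of the map byte.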
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4f59770e5eb7..6d0eef7470be 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -210,17 +210,6 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
 		shadow = swp_tb_to_shadow(old_tb);
 		offset++;
 	} while (++ci_off < ci_end);
-
-	ci_off = ci_start;
-	offset = swp_offset(entry);
-	do {
-		/*
-		 * Still need to pin the slots with SWAP_HAS_CACHE since
-		 * swap allocator depends on that.
-		 */
-		__swapcache_set_cached(si, ci, swp_entry(swp_type(entry), offset));
-		offset++;
-	} while (++ci_off < ci_end);
 	__swap_cache_add_folio(ci, folio, entry);
 	swap_cluster_unlock(ci);
 	if (shadowp)
@@ -251,6 +240,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
 	struct swap_info_struct *si;
 	unsigned long old_tb, new_tb;
 	unsigned int ci_start, ci_off, ci_end;
+	bool folio_swapped = false, need_free = false;
 	unsigned long nr_pages = folio_nr_pages(folio);
 
 	VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
@@ -268,13 +258,27 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
 		old_tb = __swap_table_xchg(ci, ci_off, new_tb);
 		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
 			     swp_tb_to_folio(old_tb) != folio);
+		if (__swap_count(swp_entry(si->type,
+					   swp_offset(entry) + ci_off - ci_start)))
+			folio_swapped = true;
+		else
+			need_free = true;
 	} while (++ci_off < ci_end);
 
 	folio->swap.val = 0;
 	folio_clear_swapcache(folio);
 	node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
 	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
-	__swapcache_clear_cached(si, ci, entry, nr_pages);
+
+	if (!folio_swapped) {
+		swap_entries_free(si, ci, swp_offset(entry), nr_pages);
+	} else if (need_free) {
+		do {
+			if (!__swap_count(entry))
+				swap_entries_free(si, ci, swp_offset(entry), 1);
+			entry.val++;
+		} while (--nr_pages);
+	}
 }
 
 /**
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a41632e74787..5721018cb28a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -48,21 +48,18 @@
 #include <linux/swap_cgroup.h>
 #include "swap_table.h"
 #include "internal.h"
+#include "swap.h"
 
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
 				 unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entries_free(struct swap_info_struct *si,
-			      struct swap_cluster_info *ci,
-			      unsigned long start, unsigned int nr_pages);
 static void swap_range_alloc(struct swap_info_struct *si,
 			     unsigned int nr_entries);
 static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr);
 static void swap_put_entry_locked(struct swap_info_struct *si,
 				  struct swap_cluster_info *ci,
-				  unsigned long offset,
-				  unsigned char usage);
+				  unsigned long offset);
 static bool folio_swapcache_freeable(struct folio *folio);
 static void move_cluster(struct swap_info_struct *si,
 			 struct swap_cluster_info *ci, struct list_head *list,
@@ -149,11 +146,6 @@ static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry)
 	return swap_type_to_info(swp_type(entry));
 }
 
-static inline unsigned char swap_count(unsigned char ent)
-{
-	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
-}
-
 /*
  * Use the second highest bit of inuse_pages counter as the indicator
  * if one swap device is on the available plist, so the atomic can
@@ -185,15 +177,20 @@ static long swap_usage_in_pages(struct swap_info_struct *si)
 #define TTRS_FULL		0x4
 
 static bool swap_only_has_cache(struct swap_info_struct *si,
-				unsigned long offset, int nr_pages)
+				struct swap_cluster_info *ci,
+				unsigned long offset, int nr_pages)
 {
+	unsigned int ci_off = offset % SWAPFILE_CLUSTER;
 	unsigned char *map = si->swap_map + offset;
 	unsigned char *map_end = map + nr_pages;
+	unsigned long swp_tb;
 
 	do {
-		VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
-		if (*map != SWAP_HAS_CACHE)
+		swp_tb = __swap_table_get(ci, ci_off);
+		VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
+		if (*map)
 			return false;
+		++ci_off;
 	} while (++map < map_end);
 
 	return true;
 }
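Since the cache no longer owns a map bit, removing a folio from the swap cache becomes the point where cache-only slots are freed. Below is a compilable model of the new __swap_cache_del_folio() bookkeeping; count[] stands in for swap_map, free_range() for swap_entries_free(), and all names are illustrative:

#include <stdbool.h>
#include <stddef.h>

static void del_folio_model(unsigned char *count, size_t nr,
			    void (*free_range)(size_t at, size_t len))
{
	bool folio_swapped = false, need_free = false;
	size_t i;

	for (i = 0; i < nr; i++) {
		if (count[i])
			folio_swapped = true;	/* still held by a swap count */
		else
			need_free = true;	/* cache was the last reference */
	}

	if (!folio_swapped)
		free_range(0, nr);		/* batch free the whole range */
	else if (need_free)
		for (i = 0; i < nr; i++)
			if (!count[i])
				free_range(i, 1);
}

The rewritten swap_only_has_cache() above relies on the same invariant: under the cluster lock, a slot whose table entry is a folio and whose count is zero is cache-only by definition.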
@@ -248,12 +245,12 @@ again:
 		goto out_unlock;
 
 	/*
-	 * It's safe to delete the folio from swap cache only if the folio's
-	 * swap_map is HAS_CACHE only, which means the slots have no page table
+	 * It's safe to delete the folio from swap cache only if the folio
+	 * is in swap cache with swap count == 0. The slots have no page table
 	 * reference or pending writeback, and can't be allocated to others.
 	 */
 	ci = swap_cluster_lock(si, offset);
-	need_reclaim = swap_only_has_cache(si, offset, nr_pages);
+	need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages);
 	swap_cluster_unlock(ci);
 	if (!need_reclaim)
 		goto out_unlock;
@@ -779,7 +776,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
 	spin_unlock(&ci->lock);
 	do {
-		if (swap_count(READ_ONCE(map[offset])))
+		if (READ_ONCE(map[offset]))
 			break;
 		swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
 		if (swp_tb_is_folio(swp_tb)) {
@@ -809,7 +806,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
 	 */
 	for (offset = start; offset < end; offset++) {
 		swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
-		if (swap_count(map[offset]) || !swp_tb_is_null(swp_tb))
+		if (map[offset] || !swp_tb_is_null(swp_tb))
 			return false;
 	}
 
@@ -829,11 +826,10 @@ static bool cluster_scan_range(struct swap_info_struct *si,
 		return true;
 
 	do {
-		if (swap_count(map[offset]))
+		if (map[offset])
 			return false;
 		swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
 		if (swp_tb_is_folio(swp_tb)) {
-			WARN_ON_ONCE(!(map[offset] & SWAP_HAS_CACHE));
 			if (!vm_swap_full())
 				return false;
 			*need_reclaim = true;
@@ -891,11 +887,6 @@ static bool cluster_alloc_range(struct swap_info_struct *si,
 	if (likely(folio)) {
 		order = folio_order(folio);
 		nr_pages = 1 << order;
-		/*
-		 * Pin the slot with SWAP_HAS_CACHE to satisfy swap_dup_entries.
-		 * This is the legacy allocation behavior, will drop it very soon.
-		 */
-		memset(si->swap_map + offset, SWAP_HAS_CACHE, nr_pages);
 		__swap_cache_add_folio(ci, folio, swp_entry(si->type, offset));
 	} else if (IS_ENABLED(CONFIG_HIBERNATION)) {
 		order = 0;
@@ -1012,8 +1003,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
 		to_scan--;
 
 		while (offset < end) {
-			if (!swap_count(READ_ONCE(map[offset])) &&
-			    swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
+			if (!READ_ONCE(map[offset]) &&
+			    swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
 				spin_unlock(&ci->lock);
 				nr_reclaim = __try_to_reclaim_swap(si, offset,
 								   TTRS_ANYWAY);
@@ -1115,7 +1106,7 @@ new_cluster:
 	 * Scan only one fragment cluster is good enough. Order 0
 	 * allocation will surely success, and large allocation
 	 * failure is not critical. Scanning one cluster still
-	 * keeps the list rotated and reclaimed (for HAS_CACHE).
+	 * keeps the list rotated and reclaimed (for clean swap cache).
 	 */
 	found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false);
 	if (found)
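All of the allocator scans above reduce to one per-slot decision that used to require masking SWAP_HAS_CACHE out of the count. A sketch of that decision as a standalone function (the enum and classify_slot() are illustrative, not kernel API):

#include <stdbool.h>

enum slot_state {
	SLOT_FREE,		/* count == 0, no folio: allocatable */
	SLOT_CACHED_ONLY,	/* count == 0, clean cache: reusable after reclaim */
	SLOT_BUSY,		/* nonzero count: mapped, pinned, or bad */
};

static enum slot_state classify_slot(unsigned char count, bool table_has_folio)
{
	if (count)
		return SLOT_BUSY;
	if (table_has_folio)
		return SLOT_CACHED_ONLY;
	return SLOT_FREE;
}

cluster_scan_range() only treats SLOT_CACHED_ONLY as usable when vm_swap_full(), and cluster_alloc_range() can drop its memset() of SWAP_HAS_CACHE because installing the folio through __swap_cache_add_folio() is what pins the range now.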
@@ -1450,8 +1441,8 @@ static void swap_put_entries_cluster(struct swap_info_struct *si,
 	do {
 		swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
 		count = si->swap_map[offset];
-		VM_WARN_ON(swap_count(count) < 1 || count == SWAP_MAP_BAD);
-		if (swap_count(count) == 1) {
+		VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD);
+		if (count == 1) {
			/* count == 1 and non-cached slots will be batch freed. */
 			if (!swp_tb_is_folio(swp_tb)) {
 				if (!batch_start)
@@ -1459,7 +1450,6 @@ static void swap_put_entries_cluster(struct swap_info_struct *si,
 				continue;
 			}
 			/* count will be 0 after put, slot can be reclaimed */
-			VM_WARN_ON(!(count & SWAP_HAS_CACHE));
 			need_reclaim = true;
 		}
 		/*
@@ -1468,7 +1458,7 @@ static void swap_put_entries_cluster(struct swap_info_struct *si,
 		 * slots will be freed when folio is removed from swap cache
 		 * (__swap_cache_del_folio).
 		 */
-		swap_put_entry_locked(si, ci, offset, 1);
+		swap_put_entry_locked(si, ci, offset);
 		if (batch_start) {
 			swap_entries_free(si, ci, batch_start, offset - batch_start);
 			batch_start = SWAP_ENTRY_INVALID;
 		}
@@ -1625,7 +1615,8 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
 	offset = swp_offset(entry);
 	if (offset >= si->max)
 		goto bad_offset;
-	if (data_race(!si->swap_map[swp_offset(entry)]))
+	if (data_race(!si->swap_map[swp_offset(entry)]) &&
+	    !swap_cache_has_folio(entry))
 		goto bad_free;
 
 	return si;
@@ -1646,21 +1637,12 @@ out:
 
 static void swap_put_entry_locked(struct swap_info_struct *si,
 				  struct swap_cluster_info *ci,
-				  unsigned long offset,
-				  unsigned char usage)
+				  unsigned long offset)
 {
 	unsigned char count;
-	unsigned char has_cache;
 
 	count = si->swap_map[offset];
-
-	has_cache = count & SWAP_HAS_CACHE;
-	count &= ~SWAP_HAS_CACHE;
-
-	if (usage == SWAP_HAS_CACHE) {
-		VM_BUG_ON(!has_cache);
-		has_cache = 0;
-	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+	if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
 		if (count == COUNT_CONTINUED) {
 			if (swap_count_continued(si, offset, count))
 				count = SWAP_MAP_MAX | COUNT_CONTINUED;
@@ -1670,10 +1652,8 @@ static void swap_put_entry_locked(struct swap_info_struct *si,
 			count--;
 	}
 
-	usage = count | has_cache;
-	if (usage)
-		WRITE_ONCE(si->swap_map[offset], usage);
-	else
+	WRITE_ONCE(si->swap_map[offset], count);
+	if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))
 		swap_entries_free(si, ci, offset, 1);
 }
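swap_put_entry_locked() thereby loses its dual role of dropping either a map reference or the cache reference. A minimal model of the remaining behavior, with the SWAP_MAP_MAX/COUNT_CONTINUED handling elided (put_slot_model() is an illustrative name):

#include <stdbool.h>

static void put_slot_model(unsigned char *count, bool table_has_folio,
			   void (*free_slot)(void))
{
	if (*count)		/* continuation handling elided */
		(*count)--;
	if (!*count && !table_has_folio)
		free_slot();	/* else __swap_cache_del_folio() frees it later */
}

This also explains the _swap_info_get() change above: a zero count with a folio still in the swap cache is a live, cache-only entry, not a freed one.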
@@ -1742,21 +1722,13 @@ put_out:
 	return NULL;
 }
 
-/*
- * Check if it's the last ref of swap entry in the freeing path.
- */
-static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
-{
-	return (count == SWAP_HAS_CACHE) || (count == 1);
-}
-
 /*
  * Drop the last ref of swap entries, caller have to ensure all entries
  * belong to the same cgroup and cluster.
  */
-static void swap_entries_free(struct swap_info_struct *si,
-			      struct swap_cluster_info *ci,
-			      unsigned long offset, unsigned int nr_pages)
+void swap_entries_free(struct swap_info_struct *si,
+		       struct swap_cluster_info *ci,
+		       unsigned long offset, unsigned int nr_pages)
 {
 	swp_entry_t entry = swp_entry(si->type, offset);
 	unsigned char *map = si->swap_map + offset;
@@ -1769,7 +1741,7 @@ static void swap_entries_free(struct swap_info_struct *si,
 	ci->count -= nr_pages;
 
 	do {
-		VM_BUG_ON(!swap_is_last_ref(*map));
+		VM_WARN_ON(*map > 1);
 		*map = 0;
 	} while (++map < map_end);
 
@@ -1788,7 +1760,7 @@ int __swap_count(swp_entry_t entry)
 	struct swap_info_struct *si = __swap_entry_to_info(entry);
 	pgoff_t offset = swp_offset(entry);
 
-	return swap_count(si->swap_map[offset]);
+	return si->swap_map[offset];
 }
 
 /**
@@ -1803,7 +1775,7 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
 	int count;
 
 	ci = swap_cluster_lock(si, offset);
-	count = swap_count(si->swap_map[offset]);
+	count = si->swap_map[offset];
 	swap_cluster_unlock(ci);
 
 	return count && count != SWAP_MAP_BAD;
@@ -1830,7 +1802,7 @@ int swp_swapcount(swp_entry_t entry)
 
 	ci = swap_cluster_lock(si, offset);
 
-	count = swap_count(si->swap_map[offset]);
+	count = si->swap_map[offset];
 	if (!(count & COUNT_CONTINUED))
 		goto out;
@@ -1868,12 +1840,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 	ci = swap_cluster_lock(si, offset);
 	if (nr_pages == 1) {
-		if (swap_count(map[roffset]))
+		if (map[roffset])
 			ret = true;
 		goto unlock_out;
 	}
 	for (i = 0; i < nr_pages; i++) {
-		if (swap_count(map[offset + i])) {
+		if (map[offset + i]) {
 			ret = true;
 			break;
 		}
 	}
@@ -2027,7 +1999,7 @@ void swap_free_hibernation_slot(swp_entry_t entry)
 		return;
 
 	ci = swap_cluster_lock(si, offset);
-	swap_put_entry_locked(si, ci, offset, 1);
+	swap_put_entry_locked(si, ci, offset);
 	swap_cluster_unlock(ci);
 
 	/* In theory readahead might add it to the swap cache by accident */
@@ -2432,6 +2404,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 					unsigned int prev)
 {
 	unsigned int i;
+	unsigned long swp_tb;
 	unsigned char count;
 
 	/*
@@ -2442,7 +2415,11 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 	 */
 	for (i = prev + 1; i < si->max; i++) {
 		count = READ_ONCE(si->swap_map[i]);
-		if (count && swap_count(count) != SWAP_MAP_BAD)
+		swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
+					i % SWAPFILE_CLUSTER);
+		if (count == SWAP_MAP_BAD)
+			continue;
+		if (count || swp_tb_is_folio(swp_tb))
 			break;
 		if ((i % LATENCY_LIMIT) == 0)
 			cond_resched();
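The remaining hunks rework swap_dup_entries(), where duplication used to juggle has_cache alongside the count. A sketch of the new guard, ignoring count continuation (dup_slot_model() and MODEL_COUNT_MAX are stand-ins for the real helpers and SWAP_MAP_MAX):

#include <errno.h>
#include <stdbool.h>

#define MODEL_COUNT_MAX	0x7f	/* stand-in for SWAP_MAP_MAX */

static int dup_slot_model(unsigned char *count, bool table_has_folio)
{
	if (!*count && !table_has_folio)
		return -ENOENT;		/* nothing pins the slot */
	if (*count >= MODEL_COUNT_MAX)
		return -EINVAL;		/* real code tries a continuation page */
	(*count)++;
	return 0;
}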
@@ -3667,8 +3644,7 @@ void si_swapinfo(struct sysinfo *val)
  * Returns error code in following case.
  * - success -> 0
  * - swp_entry is invalid -> EINVAL
- * - swap-cache reference is requested but there is already one. -> EEXIST
- * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference is requested but the entry is not used. -> ENOENT
  * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  */
 static int swap_dup_entries(struct swap_info_struct *si,
@@ -3677,39 +3653,30 @@ static int swap_dup_entries(struct swap_info_struct *si,
 			    unsigned char usage, int nr)
 {
 	int i;
-	unsigned char count, has_cache;
+	unsigned char count;
 
 	for (i = 0; i < nr; i++) {
 		count = si->swap_map[offset + i];
-
 		/*
 		 * For swapin out, allocator never allocates bad slots. for
 		 * swapin, readahead is guarded by swap_entry_swapped.
 		 */
-		if (WARN_ON(swap_count(count) == SWAP_MAP_BAD))
+		if (WARN_ON(count == SWAP_MAP_BAD))
 			return -ENOENT;
-
-		has_cache = count & SWAP_HAS_CACHE;
-		count &= ~SWAP_HAS_CACHE;
-
-		if (!count && !has_cache) {
+		/*
+		 * Swap count duplication must be guarded by either a swap cache
+		 * folio (from folio_dup_swap) or an external lock on an existing
+		 * entry (from swap_dup_entry_direct).
+		 */
+		if (WARN_ON(!count &&
+			    !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))))
 			return -ENOENT;
-		} else if (usage == SWAP_HAS_CACHE) {
-			if (has_cache)
-				return -EEXIST;
-		} else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
+		if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX))
 			return -EINVAL;
-		}
 	}
 
 	for (i = 0; i < nr; i++) {
 		count = si->swap_map[offset + i];
-		has_cache = count & SWAP_HAS_CACHE;
-		count &= ~SWAP_HAS_CACHE;
-
-		if (usage == SWAP_HAS_CACHE)
-			has_cache = SWAP_HAS_CACHE;
-		else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
 			count += usage;
 		else if (swap_count_continued(si, offset + i, count))
 			count = COUNT_CONTINUED;
@@ -3721,7 +3688,7 @@ static int swap_dup_entries(struct swap_info_struct *si,
 			return -ENOMEM;
 		}
 
-		WRITE_ONCE(si->swap_map[offset + i], count | has_cache);
+		WRITE_ONCE(si->swap_map[offset + i], count);
 	}
 
 	return 0;
@@ -3767,27 +3734,6 @@ int swap_dup_entry_direct(swp_entry_t entry)
 	return err;
 }
 
-/* Mark the swap map as HAS_CACHE, caller need to hold the cluster lock */
-void __swapcache_set_cached(struct swap_info_struct *si,
-			    struct swap_cluster_info *ci,
-			    swp_entry_t entry)
-{
-	WARN_ON(swap_dup_entries(si, ci, swp_offset(entry), SWAP_HAS_CACHE, 1));
-}
-
-/* Clear the swap map as !HAS_CACHE, caller need to hold the cluster lock */
-void __swapcache_clear_cached(struct swap_info_struct *si,
-			      struct swap_cluster_info *ci,
-			      swp_entry_t entry, unsigned int nr)
-{
-	if (swap_only_has_cache(si, swp_offset(entry), nr)) {
-		swap_entries_free(si, ci, swp_offset(entry), nr);
-	} else {
-		for (int i = 0; i < nr; i++, entry.val++)
-			swap_put_entry_locked(si, ci, swp_offset(entry), SWAP_HAS_CACHE);
-	}
-}
-
 /*
  * add_swap_count_continuation - called when a swap count is duplicated
  * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
@@ -3833,7 +3779,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 
 	ci = swap_cluster_lock(si, offset);
 
-	count = swap_count(si->swap_map[offset]);
+	count = si->swap_map[offset];
 	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
 		/*