From b2cd5ae693a3dc5b70a0f75fba96452c591a2047 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2025 11:39:59 -0700 Subject: [PATCH 1/2] iomap: make buffered writes work with RWF_DONTCACHE Add iomap buffered write support for RWF_DONTCACHE. If RWF_DONTCACHE is set for a write, mark the folios being written as uncached. Then writeback completion will drop the pages. The write_iter handler simply kicks off writeback for the pages, and writeback completion will take care of the rest. Signed-off-by: "Darrick J. Wong" Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250204184047.356762-2-axboe@kernel.dk Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/design.rst | 5 +++++ Documentation/filesystems/iomap/operations.rst | 2 ++ fs/iomap/buffered-io.c | 4 ++++ include/linux/iomap.h | 1 + 4 files changed, 12 insertions(+) diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index b0d0188a095e..7b91546750f5 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -352,6 +352,11 @@ operations: ``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or ``RWF_NOWAIT``. + * ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a + buffered file I/O and would like the kernel to drop the pagecache + after the I/O completes, if it isn't already being used by another + thread. + If it is necessary to read existing file contents from a `different `_ device or address range on a device, the filesystem should return that diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 2c7f5df9d8b0..584ff549f9a6 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``. + * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``. + Internal per-Folio State ------------------------ diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..ea863c3cf510 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -603,6 +603,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + if (iter->flags & IOMAP_DONTCACHE) + fgp |= FGP_DONTCACHE; fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, @@ -1034,6 +1036,8 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_DONTCACHE) + iter.flags |= IOMAP_DONTCACHE; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..26b0dbe23e62 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -183,6 +183,7 @@ struct iomap_folio_ops { #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ #define IOMAP_ATOMIC (1 << 9) +#define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { /* From 974c5e6139db30fae668e44c381d13bcc63b65fa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2025 11:40:00 -0700 Subject: [PATCH 2/2] xfs: flag as supporting FOP_DONTCACHE Read side was already fully supported, and with the write side appropriately punted to the worker queue, all that's needed now is setting FOP_DONTCACHE in the file_operations structure to enable full support for read and write uncached IO. This provides similar benefits to using RWF_DONTCACHE with reads. Testing buffered writes on 32 files: writing bs 65536, uncached 0 1s: 196035MB/sec 2s: 132308MB/sec 3s: 132438MB/sec 4s: 116528MB/sec 5s: 103898MB/sec 6s: 108893MB/sec 7s: 99678MB/sec 8s: 106545MB/sec 9s: 106826MB/sec 10s: 101544MB/sec 11s: 111044MB/sec 12s: 124257MB/sec 13s: 116031MB/sec 14s: 114540MB/sec 15s: 115011MB/sec 16s: 115260MB/sec 17s: 116068MB/sec 18s: 116096MB/sec where it's quite obvious where the page cache filled, and performance dropped from to about half of where it started, settling in at around 115GB/sec. Meanwhile, 32 kswapds were running full steam trying to reclaim pages. Running the same test with uncached buffered writes: writing bs 65536, uncached 1 1s: 198974MB/sec 2s: 189618MB/sec 3s: 193601MB/sec 4s: 188582MB/sec 5s: 193487MB/sec 6s: 188341MB/sec 7s: 194325MB/sec 8s: 188114MB/sec 9s: 192740MB/sec 10s: 189206MB/sec 11s: 193442MB/sec 12s: 189659MB/sec 13s: 191732MB/sec 14s: 190701MB/sec 15s: 191789MB/sec 16s: 191259MB/sec 17s: 190613MB/sec 18s: 191951MB/sec and the behavior is fully predictable, performing the same throughout even after the page cache would otherwise have fully filled with dirty data. It's also about 65% faster, and using half the CPU of the system compared to the normal buffered write. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250204184047.356762-3-axboe@kernel.dk Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/xfs/xfs_file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7a7d89c345e..358987b6e2f8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1626,7 +1626,8 @@ const struct file_operations xfs_file_operations = { .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE, + FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = {