ext4: Add support for blocksize < pagesize in dioread_nolock
This patch adds the support for blocksize < pagesize for
dioread_nolock feature.
Since in case of blocksize < pagesize, we can have multiple
small buffers of page as unwritten extents, we need to
maintain a vector of these unwritten extents which needs
the conversion after the IO is complete. Thus, we maintain
a list of tuple <offset, size> pair (io_end_vec) for this &
traverse this list to do the unwritten to written conversion.
Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
Link: https://lore.kernel.org/r/20191016073711.4141-5-riteshh@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
BUG=b:144741353
TEST=xfstests/smoke, no new failures introduced
SOURCE=UPSTREAM(c8cc88163f40df39e50cda63ac361631864b453e)
Change-Id: I52caee29fe7949298f9b7fcdc2dd48cd1b09a9db
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/lakitu-kernel/+/2113634
Commit-Queue: Vaibhav Rustagi <vaibhavrustagi@google.com>
Commit-Queue: Harshad Shirwadkar <harshads@google.com>
Tested-by: Vaibhav Rustagi <vaibhavrustagi@google.com>
Reviewed-by: Roy Yang <royyang@google.com>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c77f217..6c73dd8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -207,6 +207,12 @@
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
+struct ext4_io_end_vec {
+ struct list_head list; /* list of io_end_vec */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+};
+
/*
* For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
@@ -220,8 +226,7 @@
* bios covering the extent */
unsigned int flag; /* unwritten or not */
atomic_t count; /* reference counter */
- loff_t offset; /* offset in the file */
- ssize_t size; /* size of the extent */
+ struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -3237,6 +3242,8 @@
int len,
struct writeback_control *wbc,
bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8f8f488..dc47703 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5067,6 +5067,7 @@
int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
int ret, err = 0;
+ struct ext4_io_end_vec *io_end_vec;
/*
* This is somewhat ugly but the idea is clear: When transaction is
@@ -5080,8 +5081,14 @@
return PTR_ERR(handle);
}
- ret = ext4_convert_unwritten_extents(handle, io_end->inode,
- io_end->offset, io_end->size);
+ list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+ ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+ io_end_vec->offset,
+ io_end_vec->size);
+ if (ret)
+ break;
+ }
+
if (handle)
err = ext4_journal_stop(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ced2fa8..f5b5f027 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2365,6 +2365,9 @@
ext4_lblk_t lblk = *m_lblk;
ext4_fsblk_t pblock = *m_pblk;
int err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ssize_t io_end_size = 0;
+ struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
bh = head = page_buffers(page);
do {
@@ -2377,17 +2380,16 @@
*/
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
- /*
- * FIXME: If dioread_nolock supports
- * blocksize < pagesize, we need to make
- * sure we add size mapped so far to
- * io_end->size as the following call
- * can submit the page for IO.
- */
err = mpage_process_page_bufs(mpd, head, bh, lblk);
if (err > 0)
err = 0;
+ if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ }
*map_bh = true;
goto out;
}
@@ -2396,13 +2398,11 @@
bh->b_blocknr = pblock++;
}
clear_buffer_unwritten(bh);
+ io_end_size += (1 << blkbits);
} while (lblk++, (bh = bh->b_this_page) != head);
- /*
- * FIXME: This is going to break if dioread_nolock
- * supports blocksize < pagesize as we will try to
- * convert potentially unmapped parts of inode.
- */
- io_end->size += PAGE_SIZE;
+
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
*map_bh = false;
out:
*m_lblk = lblk;
@@ -2556,9 +2556,10 @@
int err;
loff_t disksize;
int progress = 0;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ struct ext4_io_end_vec *io_end_vec = ext4_alloc_io_end_vec(io_end);
- mpd->io_submit.io_end->offset =
- ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
@@ -3681,6 +3682,7 @@
ssize_t size, void *private)
{
ext4_io_end_t *io_end = private;
+ struct ext4_io_end_vec *io_end_vec;
/* if not async direct IO just return */
if (!io_end)
@@ -3698,8 +3700,9 @@
ext4_clear_io_unwritten_flag(io_end);
size = 0;
}
- io_end->offset = offset;
- io_end->size = size;
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ io_end_vec->offset = offset;
+ io_end_vec->size = size;
ext4_put_io_end(io_end);
return 0;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 99df733..92860e6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -31,18 +31,56 @@
#include "acl.h"
static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
int __init ext4_init_pageio(void)
{
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL)
return -ENOMEM;
+
+ io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+ if (io_end_vec_cachep == NULL) {
+ kmem_cache_destroy(io_end_cachep);
+ return -ENOMEM;
+ }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec;
+
+ io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+ if (!io_end_vec)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&io_end_vec->list);
+ list_add_tail(&io_end_vec->list, &io_end->list_vec);
+ return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec, *tmp;
+
+ if (list_empty(&io_end->list_vec))
+ return;
+ list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+ list_del(&io_end_vec->list);
+ kmem_cache_free(io_end_vec_cachep, io_end_vec);
+ }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+ BUG_ON(list_empty(&io_end->list_vec));
+ return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}
/*
@@ -133,6 +171,7 @@
ext4_finish_bio(bio);
bio_put(bio);
}
+ ext4_free_io_end_vec(io_end);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -147,8 +186,6 @@
static int ext4_end_io_end(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
- loff_t offset = io_end->offset;
- ssize_t size = io_end->size;
handle_t *handle = io_end->handle;
int ret = 0;
@@ -162,8 +199,7 @@
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
+ "(inode %lu, error %d)", inode->i_ino, ret);
}
ext4_clear_io_unwritten_flag(io_end);
ext4_release_io_end(io_end);
@@ -255,6 +291,7 @@
if (io_end) {
io_end->inode = inode;
INIT_LIST_HEAD(&io_end->list);
+ INIT_LIST_HEAD(&io_end->list_vec);
atomic_set(&io_end->count, 1);
}
return io_end;
@@ -263,7 +300,8 @@
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+ list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
return;
}
@@ -315,10 +353,8 @@
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
+ "starting block %llu)",
bio->bi_status, inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
mapping_set_error(inode->i_mapping,