Skip to content

Commit

Permalink
f2fs: change the current atomic write way
Browse files Browse the repository at this point in the history
Current atomic write has three major issues like below.
 - keeps the updates in non-reclaimable memory space and they are even
   hard to be migrated, which is not good for contiguous memory
   allocation.
 - disk spaces used for atomic files cannot be garbage collected, so
   this makes it difficult for the filesystem to be defragmented.
 - If atomic write operations hit the threshold of either memory usage
   or garbage collection failure count, All the atomic write operations
   will fail immediately.

To resolve the issues, I will keep a COW inode internally for all the
updates to be flushed from memory, when we need to flush them out in a
situation like high memory pressure. These COW inodes will be tagged
as orphan inodes to be reclaimed in case of sudden power-cut or system
failure during atomic writes.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
  • Loading branch information
Daeho Jeong authored and Jaegeuk Kim committed May 12, 2022
1 parent 6213f5d commit 3db1de0
Show file tree
Hide file tree
Showing 13 changed files with 303 additions and 446 deletions.
180 changes: 115 additions & 65 deletions fs/f2fs/data.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ static bool __is_cp_guaranteed(struct page *page)

if (f2fs_is_compressed_page(page))
return false;
if ((S_ISREG(inode->i_mode) &&
(f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
page_private_gcing(page))
return true;
return false;
Expand Down Expand Up @@ -2563,7 +2562,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
bool ipu_force = false;
int err = 0;

set_new_dnode(&dn, inode, NULL, NULL, 0);
/* Use COW inode to make dnode_of_data for atomic write */
if (f2fs_is_atomic_file(inode))
set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0);
else
set_new_dnode(&dn, inode, NULL, NULL, 0);

if (need_inplace_update(fio) &&
f2fs_lookup_extent_cache(inode, page->index, &ei)) {
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
Expand Down Expand Up @@ -2600,6 +2604,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
err = -EFSCORRUPTED;
goto out_writepage;
}

/*
* If current allocation needs SSR,
* it had better in-place writes for updated data.
Expand Down Expand Up @@ -3313,6 +3318,100 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
return err;
}

static int __find_data_block(struct inode *inode, pgoff_t index,
block_t *blk_addr)
{
struct dnode_of_data dn;
struct page *ipage;
struct extent_info ei = {0, };
int err = 0;

ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);

set_new_dnode(&dn, inode, ipage, ipage, 0);

if (f2fs_lookup_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
/* hole case */
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err) {
dn.data_blkaddr = NULL_ADDR;
err = 0;
}
}
*blk_addr = dn.data_blkaddr;
f2fs_put_dnode(&dn);
return err;
}

static int __reserve_data_block(struct inode *inode, pgoff_t index,
block_t *blk_addr, bool *node_changed)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
struct page *ipage;
int err = 0;

f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);

ipage = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto unlock_out;
}
set_new_dnode(&dn, inode, ipage, ipage, 0);

err = f2fs_get_block(&dn, index);

*blk_addr = dn.data_blkaddr;
*node_changed = dn.node_changed;
f2fs_put_dnode(&dn);

unlock_out:
f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
return err;
}

static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
struct page *page, loff_t pos, unsigned int len,
block_t *blk_addr, bool *node_changed)
{
struct inode *inode = page->mapping->host;
struct inode *cow_inode = F2FS_I(inode)->cow_inode;
pgoff_t index = page->index;
int err = 0;
block_t ori_blk_addr;

/* If pos is beyond the end of file, reserve a new block in COW inode */
if ((pos & PAGE_MASK) >= i_size_read(inode))
return __reserve_data_block(cow_inode, index, blk_addr,
node_changed);

/* Look for the block in COW inode first */
err = __find_data_block(cow_inode, index, blk_addr);
if (err)
return err;
else if (*blk_addr != NULL_ADDR)
return 0;

/* Look for the block in the original inode */
err = __find_data_block(inode, index, &ori_blk_addr);
if (err)
return err;

/* Finally, we should reserve a new block in COW inode for the update */
err = __reserve_data_block(cow_inode, index, blk_addr, node_changed);
if (err)
return err;

if (ori_blk_addr != NULL_ADDR)
*blk_addr = ori_blk_addr;
return 0;
}

static int f2fs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
Expand All @@ -3321,7 +3420,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page = NULL;
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
bool need_balance = false, drop_atomic = false;
bool need_balance = false;
block_t blkaddr = NULL_ADDR;
int err = 0;

Expand All @@ -3332,14 +3431,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
goto fail;
}

if ((f2fs_is_atomic_file(inode) &&
!f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
err = -ENOMEM;
drop_atomic = true;
goto fail;
}

/*
* We should check this at this moment to avoid deadlock on inode page
* and #0 page. The locking rule for inline_data conversion should be:
Expand Down Expand Up @@ -3387,7 +3478,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,

*pagep = page;

err = prepare_write_begin(sbi, page, pos, len,
if (f2fs_is_atomic_file(inode))
err = prepare_atomic_write_begin(sbi, page, pos, len,
&blkaddr, &need_balance);
else
err = prepare_write_begin(sbi, page, pos, len,
&blkaddr, &need_balance);
if (err)
goto fail;
Expand Down Expand Up @@ -3443,8 +3538,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
fail:
f2fs_put_page(page, 1);
f2fs_write_failed(inode, pos + len);
if (drop_atomic)
f2fs_drop_inmem_pages_all(sbi, false);
return err;
}

Expand Down Expand Up @@ -3488,8 +3581,12 @@ static int f2fs_write_end(struct file *file,
set_page_dirty(page);

if (pos + copied > i_size_read(inode) &&
!f2fs_verity_in_progress(inode))
!f2fs_verity_in_progress(inode)) {
f2fs_i_size_write(inode, pos + copied);
if (f2fs_is_atomic_file(inode))
f2fs_i_size_write(F2FS_I(inode)->cow_inode,
pos + copied);
}
unlock_out:
f2fs_put_page(page, 1);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
Expand Down Expand Up @@ -3522,9 +3619,6 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
inode->i_ino == F2FS_COMPRESS_INO(sbi))
clear_page_private_data(&folio->page);

if (page_private_atomic(&folio->page))
return f2fs_drop_inmem_page(inode, &folio->page);

folio_detach_private(folio);
}

Expand All @@ -3534,10 +3628,6 @@ int f2fs_release_page(struct page *page, gfp_t wait)
if (PageDirty(page))
return 0;

/* This is atomic written page, keep Private */
if (page_private_atomic(page))
return 0;

if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) {
struct inode *inode = page->mapping->host;

Expand All @@ -3563,18 +3653,6 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
folio_mark_uptodate(folio);
BUG_ON(folio_test_swapcache(folio));

if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
if (!page_private_atomic(&folio->page)) {
f2fs_register_inmem_page(inode, &folio->page);
return true;
}
/*
* Previously, this page has been registered, we just
* return here.
*/
return false;
}

if (!folio_test_dirty(folio)) {
filemap_dirty_folio(mapping, folio);
f2fs_update_dirty_folio(inode, folio);
Expand Down Expand Up @@ -3654,42 +3732,14 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
int f2fs_migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page, enum migrate_mode mode)
{
int rc, extra_count;
struct f2fs_inode_info *fi = F2FS_I(mapping->host);
bool atomic_written = page_private_atomic(page);
int rc, extra_count = 0;

BUG_ON(PageWriteback(page));

/* migrating an atomic written page is safe with the inmem_lock hold */
if (atomic_written) {
if (mode != MIGRATE_SYNC)
return -EBUSY;
if (!mutex_trylock(&fi->inmem_lock))
return -EAGAIN;
}

/* one extra reference was held for atomic_write page */
extra_count = atomic_written ? 1 : 0;
rc = migrate_page_move_mapping(mapping, newpage,
page, extra_count);
if (rc != MIGRATEPAGE_SUCCESS) {
if (atomic_written)
mutex_unlock(&fi->inmem_lock);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
}

if (atomic_written) {
struct inmem_pages *cur;

list_for_each_entry(cur, &fi->inmem_pages, list)
if (cur->page == page) {
cur->page = newpage;
break;
}
mutex_unlock(&fi->inmem_lock);
put_page(page);
get_page(newpage);
}

/* guarantee to start from no stale private field */
set_page_private(newpage, 0);
Expand Down
12 changes: 2 additions & 10 deletions fs/f2fs/debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->aw_cnt = sbi->atomic_files;
si->vw_cnt = atomic_read(&sbi->vw_cnt);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
Expand Down Expand Up @@ -167,8 +166,6 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
si->io_skip_bggc = sbi->io_skip_bggc;
si->other_skip_bggc = sbi->other_skip_bggc;
si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC];
si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC];
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
Expand Down Expand Up @@ -296,7 +293,6 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
sizeof(struct nat_entry);
si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] *
sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
si->cache_mem += atomic_read(&sbi->total_ext_tree) *
Expand Down Expand Up @@ -491,10 +487,6 @@ static int stat_show(struct seq_file *s, void *v)
si->bg_data_blks);
seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
si->bg_node_blks);
seq_printf(s, "Skipped : atomic write %llu (%llu)\n",
si->skipped_atomic_files[BG_GC] +
si->skipped_atomic_files[FG_GC],
si->skipped_atomic_files[BG_GC]);
seq_printf(s, "BG skip : IO: %u, Other: %u\n",
si->io_skip_bggc, si->other_skip_bggc);
seq_puts(s, "\nExtent Cache:\n");
Expand All @@ -519,9 +511,9 @@ static int stat_show(struct seq_file *s, void *v)
si->flush_list_empty,
si->nr_discarding, si->nr_discarded,
si->nr_discard_cmd, si->undiscard_blks);
seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), "
seq_printf(s, " - atomic IO: %4d (Max. %4d), "
"volatile IO: %4d (Max. %4d)\n",
si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
si->aw_cnt, si->max_aw_cnt,
si->vw_cnt, si->max_vw_cnt);
seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit);
seq_printf(s, " - nodes: %4d in %4d\n",
Expand Down
Loading

0 comments on commit 3db1de0

Please sign in to comment.