[Tux3] RFC - Delayed allocation for Tux3
Liu Hui
onlyflyer at gmail.com
Thu Jan 15 06:25:44 PST 2009
I think delayed allocation is not just about allocating blocks or
extents at writeback time; it is also used to merge block allocation
operations, which reduces the number of allocations and disk fragmentation.
IMHO, this patch does indeed delay the allocation operations, but it does
not really address the motive behind delayed allocation.
2009/1/15 Liu XiaoFeng <bladehliu at gmail.com>:
> Here is a simple implementation of delayed allocation for Tux3.
>
> Delayed allocation defers block allocation from prepare-write(write-begin)
> time to page writeback time. It is a powerful technique and implemented by
> several filesystems such as XFS, ext4, and btrfs.
>
> Unlike ext4's delalloc, this implementation is independent with extent tree
> structure.
>
> Signed-off by XiaoFeng Liu.
>
> ---
>
> balloc.c | 1
> filemap.c | 108
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> inode.c | 2 -
> super.c | 7 ++++
> trace.h | 11 ++++++
> tux3.h | 30 +++++++++++++++++
> 6 files changed, 158 insertions(+), 1 deletion(-)
>
>
> diff -pNur tux-orig/balloc.c tux-hack/balloc.c
> --- tux-orig/balloc.c 2009-01-14 20:00:22.000000000 +0800
> +++ tux-hack/balloc.c 2009-01-15 14:29:26.000000000 +0800
> @@ -281,6 +281,7 @@ int bfree(struct sb *sb, block_t start,
> clear_bits(bufdata(buffer), start, blocks);
> brelse_dirty(buffer);
> sb->freeblocks += blocks;
> + tux3_release_blocks(sb, blocks);
> //set_sb_dirty(sb);
> mutex_unlock(&sb->bitmap->i_mutex);
> return 0;
> diff -pNur tux-orig/filemap.c tux-hack/filemap.c
> --- tux-orig/filemap.c 2009-01-14 20:00:22.000000000 +0800
> +++ tux-hack/filemap.c 2009-01-15 14:25:07.000000000 +0800
> @@ -505,4 +505,112 @@ const struct address_space_operations tu
> .sync_page = block_sync_page,
> .write_begin = tux3_vol_write_begin,
> };
> +
> +
> +/*
> + * Tux3's delayed allocation
> + * Note: support blocksize == pagesize only
> + * Written by XiaoFeng LIU <xfengliu at mail.ustc.edu.cn>
> + */
> +
> +/* proof of concept */
> +#define NR_RESERV_BLOCKS 32
> +
> +static int tux3_da_reserve_blocks(struct super_block *sb, int count)
> +{
> + long free_blocks;
> + struct sb *sbi = tux_sb(sb);
> + free_blocks = percpu_counter_read_positive(freeblocks_counter(sbi));
> + xtrace("freeblocks_counter %ld", free_blocks);
> +
> + if (free_blocks < count + NR_RESERV_BLOCKS)
> + return -ENOSPC;
> + percpu_counter_sub(freeblocks_counter(sbi), count);
> + return 0;
> +}
> +
> +static void tux3_da_release_blocks(struct super_block *sb, int count)
> +{
> + struct sb *sbi = tux_sb(sb);
> + if (count) {
> + percpu_counter_add(freeblocks_counter(sbi), count);
> + sb->s_dirt = 1;
> + }
> +}
> +
> +static int tux3_get_block_delay(struct inode *inode, sector_t iblock,
> + struct buffer_head *bh_rslt, int create)
> +{
> + return tux3_get_block(inode, iblock, bh_rslt, 0);
> +}
> +
> +/*
> + * a get_block() called at the writeout time.
> + */
> +static int tux3_get_block_write(struct inode *inode, sector_t iblock,
> + struct buffer_head *bh_rslt, int create)
> +{
> + pgoff_t index = (pgoff_t) (iblock >> (PAGE_CACHE_SHIFT -
> inode->i_blkbits));
> + struct page *page = find_get_page(inode->i_mapping, index);
> +
> + /* the page should be here, and dirty */
> + if (unlikely(!page)) {
> + xtrace("find_get_page ret NULL.");
> + goto out;
> + }
> + if (create && PageChecked(page)) {
> + ClearPageChecked(page);
> + tux3_da_release_blocks(inode->i_sb, 1);
> + }
> + if (page)
> + page_cache_release(page);
> +
> +out:
> + return tux3_get_block(inode, iblock, bh_rslt, create);
> +}
> +
> +static int tux3_da_write_begin(struct file *file, struct address_space
> *mapping,
> + loff_t pos, unsigned len, unsigned flags,
> + struct page **pagep, void **fsdata)
> +{
> + return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
> + tux3_get_block_delay);
> +}
> +
> +static int tux3_da_write_end(struct file *file, struct address_space
> *mapping,
> + loff_t pos, unsigned len, unsigned copied,
> + struct page *page, void *fsdata)
> +{
> + if (!PageChecked(page)) {
> + int ret = tux3_da_reserve_blocks(mapping->host->i_sb, 1);
> + if (ret)
> + return ret;
> + SetPageChecked(page);
> + }
> + return nobh_write_end(file, mapping, pos, len, copied, page,
> fsdata);
> +}
> +
> +static int tux3_da_writepage(struct page *page, struct writeback_control
> *wbc)
> +{
> + return nobh_writepage(page, tux3_get_block_write, wbc);
> +}
> +static int tux3_da_writepages(struct address_space *mapping,
> + struct writeback_control *wbc)
> +{
> + return mpage_writepages(mapping, wbc, tux3_get_block_write);
> +}
> +
> +const struct address_space_operations tux_da_aops = {
> + .readpage = tux3_readpage,
> + .readpages = tux3_readpages,
> + .writepage = tux3_da_writepage,
> + .writepages = tux3_da_writepages,
> + .sync_page = block_sync_page,
> + .write_begin = tux3_da_write_begin,
> + .write_end = tux3_da_write_end,
> + .bmap = tux3_bmap,
> + .direct_IO = tux3_direct_IO,
> + .migratepage = buffer_migrate_page,
> +};
> +
> #endif /* __KERNEL__ */
> diff -pNur tux-orig/inode.c tux-hack/inode.c
> --- tux-orig/inode.c 2009-01-14 20:00:22.000000000 +0800
> +++ tux-hack/inode.c 2009-01-15 15:29:43.000000000 +0800
> @@ -438,7 +438,7 @@ static void tux_setup_inode(struct inode
> case S_IFREG:
> inode->i_op = &tux_file_iops;
> inode->i_fop = &tux_file_fops;
> - inode->i_mapping->a_ops = &tux_aops;
> + inode->i_mapping->a_ops = &tux_da_aops;
> break;
> case S_IFDIR:
> inode->i_op = &tux_dir_iops;
> diff -pNur tux-orig/super.c tux-hack/super.c
> --- tux-orig/super.c 2009-01-14 20:00:22.000000000 +0800
> +++ tux-hack/super.c 2009-01-15 14:27:53.000000000 +0800
> @@ -106,6 +106,9 @@ static void tux3_put_super(struct super_
> iput(sbi->volmap);
> iput(sbi->logmap);
>
> + /* destroy block allocation info */
> + tux3_balloc_info_destroy(sbi);
> +
> sb->s_fs_info = NULL;
> kfree(sbi);
> }
> @@ -172,6 +175,10 @@ static int tux3_fill_super(struct super_
> err = tux_load_sb(sb, silent);
> if (err)
> goto error;
> +
> + /* initialize block allocation info */
> + tux3_balloc_info_init(sbi);
> +
> printk("%s: sb %p, ops %p, depth %Lu, block %Lu, entries_per_leaf
> %d\n",
> __func__,
> sbi->itable.sb, sbi->itable.ops,
> diff -pNur tux-orig/trace.h tux-hack/trace.h
> --- tux-orig/trace.h 2009-01-14 20:00:22.000000000 +0800
> +++ tux-hack/trace.h 2009-01-15 15:04:49.000000000 +0800
> @@ -22,4 +22,15 @@
> die(100); \
> } while (0)
>
> +
> +#ifdef __KERNEL__
> +/* debug macro, xiaofeng */
> +#define xtrace(f, a...) { \
> + printk ("(%s, %d): %s:", \
> + __FILE__, __LINE__, __FUNCTION__); \
> + printk (f, ## a); \
> + printk ("\n"); \
> + }
> +
> +#endif
> #endif
> diff -pNur tux-orig/tux3.h tux-hack/tux3.h
> --- tux-orig/tux3.h 2009-01-14 20:00:22.000000000 +0800
> +++ tux-hack/tux3.h 2009-01-15 15:34:37.000000000 +0800
> @@ -9,6 +9,8 @@
> #include <linux/fs.h>
> #include <linux/buffer_head.h>
> #include <linux/mutex.h>
> +#include <linux/mm.h>
> +#include <linux/percpu_counter.h>
>
> #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
> #include <linux/cred.h> // fsuid
> @@ -213,6 +215,13 @@ struct cursor {
> } path[];
> };
>
> +/* Tux3 block allocation information */
> +struct tux3_balloc_info {
> + struct percpu_counter freeblocks_counter;
> + /* nextalloc_counter, and others */
> +};
> +#define freeblocks_counter(sbi) (&sbi->balloc_info.freeblocks_counter)
> +
> /* Tux3-specific sb is a handle for the entire volume state */
>
> struct sb {
> @@ -241,6 +250,7 @@ struct sb {
> struct mutex loglock; /* serialize log entries (spinlock me) */
> #ifdef __KERNEL__
> struct super_block *vfs_sb; /* Generic kernel superblock */
> + struct tux3_balloc_info balloc_info; /* control info for block
> allocation */
> #else
> struct dev *dev; /* userspace block device */
> #endif
> @@ -620,6 +630,25 @@ static inline struct inode *buffer_inode
> return buffer->b_page->mapping->host;
> }
>
> +static inline void tux3_balloc_info_init(struct sb* sbi)
> +{
> + percpu_counter_init(freeblocks_counter(sbi), sbi->freeblocks);
> +}
> +static inline void tux3_balloc_info_destroy(struct sb* sbi)
> +{
> + percpu_counter_destroy(freeblocks_counter(sbi));
> +}
> +
> +static inline void tux3_release_blocks(struct sb* sbi, int count)
> +{
> + percpu_counter_add(freeblocks_counter(sbi), count);
> +}
> +
> +static inline void tux3_reserve_blocks(struct sb* sbi, int count)
> +{
> + percpu_counter_sub(freeblocks_counter(sbi), count);
> +}
> +
> /* btree.c */
> struct buffer_head *cursor_leafbuf(struct cursor *cursor);
> void release_cursor(struct cursor *cursor);
> @@ -678,6 +707,7 @@ int tux3_get_block(struct inode *inode,
> extern const struct address_space_operations tux_aops;
> extern const struct address_space_operations tux_blk_aops;
> extern const struct address_space_operations tux_vol_aops;
> +extern const struct address_space_operations tux_da_aops;
>
> /* iattr.c */
> unsigned encode_asize(unsigned bits);
>
>
> _______________________________________________
> Tux3 mailing list
> Tux3 at tux3.org
> http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3
>
>
--
Thanks & Best Regards
Liu Hui
--
_______________________________________________
Tux3 mailing list
Tux3 at tux3.org
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3
More information about the Tux3
mailing list