[Tux3] RFC - Delayed allocation for Tux3

Liu XiaoFeng bladehliu at gmail.com
Thu Jan 15 06:52:47 PST 2009


Sorry, this patch has a fault. Block reservation should not be called if
tux3_get_block_delay() returns a mapped buffer.

I'll re-send it tomorrow morning.


On Thu, Jan 15, 2009 at 10:25 PM, Liu Hui <onlyflyer at gmail.com> wrote:

> I think delayed allocation is not just for allocating blocks or
> extents at writeback time; it is also used to merge block allocation
> operations, which reduces the number of allocations and disk fragmentation.


Yep. Delayed allocation reduces buffered-write latency and disk
fragmentation as well.


>
> IMHO, this patch does indeed delay the allocation operations, but it
> does not really reflect the motivation for delayed allocation.



Thanks. It is a very simple implementation that uses the nobh routines to
defer block allocation and a private page flag to reserve space.


>
>
> 2009/1/15 Liu XiaoFeng <bladehliu at gmail.com>:
> > Here is a simple implementation of delayed allocation for Tux3.
> >
> > Delayed allocation defers block allocation from prepare-write
> > (write-begin) time to page writeback time. It is a powerful technique
> > and is implemented by several filesystems such as XFS, ext4, and btrfs.
> >
> > Unlike ext4's delalloc, this implementation is independent of the
> > extent tree structure.
> >
> > Signed-off by XiaoFeng Liu.
> >
> > ---
> >
> >  balloc.c  |    1
> >  filemap.c |  108
> > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  inode.c   |    2 -
> >  super.c   |    7 ++++
> >  trace.h   |   11 ++++++
> >  tux3.h    |   30 +++++++++++++++++
> >  6 files changed, 158 insertions(+), 1 deletion(-)
> >
> >
> > diff -pNur tux-orig/balloc.c tux-hack/balloc.c
> > --- tux-orig/balloc.c    2009-01-14 20:00:22.000000000 +0800
> > +++ tux-hack/balloc.c    2009-01-15 14:29:26.000000000 +0800
> > @@ -281,6 +281,7 @@ int bfree(struct sb *sb, block_t start,
> >      clear_bits(bufdata(buffer), start, blocks);
> >      brelse_dirty(buffer);
> >      sb->freeblocks += blocks;
> > +    tux3_release_blocks(sb, blocks);
> >      //set_sb_dirty(sb);
> >      mutex_unlock(&sb->bitmap->i_mutex);
> >      return 0;
> > diff -pNur tux-orig/filemap.c tux-hack/filemap.c
> > --- tux-orig/filemap.c    2009-01-14 20:00:22.000000000 +0800
> > +++ tux-hack/filemap.c    2009-01-15 14:25:07.000000000 +0800
> > @@ -505,4 +505,112 @@ const struct address_space_operations tu
> >      .sync_page    = block_sync_page,
> >      .write_begin    = tux3_vol_write_begin,
> >  };
> > +
> > +
> > +/*
> > + * Tux3's delayed allocation
> > + * Note: support blocksize == pagesize only
> > + * Written by XiaoFeng LIU <xfengliu at mail.ustc.edu.cn>
> > + */
> > +
> > +/* proof of concept */
> > +#define NR_RESERV_BLOCKS    32
> > +
> > +static int tux3_da_reserve_blocks(struct super_block *sb, int count)
> > +{
> > +    long free_blocks;
> > +    struct sb *sbi = tux_sb(sb);
> > +       free_blocks =
> percpu_counter_read_positive(freeblocks_counter(sbi));
> > +    xtrace("freeblocks_counter %ld", free_blocks);
> > +
> > +    if (free_blocks < count + NR_RESERV_BLOCKS)
> > +        return -ENOSPC;
> > +    percpu_counter_sub(freeblocks_counter(sbi), count);
> > +    return 0;
> > +}
> > +
> > +static void tux3_da_release_blocks(struct super_block *sb, int count)
> > +{
> > +    struct sb *sbi = tux_sb(sb);
> > +    if (count) {
> > +        percpu_counter_add(freeblocks_counter(sbi), count);
> > +        sb->s_dirt = 1;
> > +    }
> > +}
> > +
> > +static int tux3_get_block_delay(struct inode *inode, sector_t iblock,
> > +                struct buffer_head *bh_rslt, int create)
> > +{
> > +    return tux3_get_block(inode, iblock, bh_rslt, 0);
> > +}
> > +
> > +/*
> > + * a get_block() called at the writeout time.
> > + */
> > +static int tux3_get_block_write(struct inode *inode, sector_t iblock,
> > +                struct buffer_head *bh_rslt, int create)
> > +{
> > +    pgoff_t index = (pgoff_t) (iblock >> (PAGE_CACHE_SHIFT -
> > inode->i_blkbits));
> > +    struct page *page = find_get_page(inode->i_mapping, index);
> > +
> > +    /* the page should be here, and dirty */
> > +    if (unlikely(!page)) {
> > +        xtrace("find_get_page ret NULL.");
> > +        goto out;
> > +    }
> > +    if (create && PageChecked(page)) {
> > +             ClearPageChecked(page);
> > +        tux3_da_release_blocks(inode->i_sb, 1);
> > +    }
> > +    if (page)
> > +        page_cache_release(page);
> > +
> > +out:
> > +    return tux3_get_block(inode, iblock, bh_rslt, create);
> > +}
> > +
> > +static int tux3_da_write_begin(struct file *file, struct address_space
> > *mapping,
> > +                loff_t pos, unsigned len, unsigned flags,
> > +                struct page **pagep, void **fsdata)
> > +{
> > +    return nobh_write_begin(file, mapping, pos, len, flags, pagep,
> fsdata,
> > +                tux3_get_block_delay);
> > +}
> > +
> > +static int tux3_da_write_end(struct file *file, struct address_space
> > *mapping,
> > +                loff_t    pos, unsigned len, unsigned copied,
> > +                struct page *page, void *fsdata)
> > +{
> > +    if (!PageChecked(page)) {
> > +        int ret = tux3_da_reserve_blocks(mapping->host->i_sb, 1);
> > +        if (ret)
> > +            return ret;
> > +        SetPageChecked(page);
> > +    }
> > +           return nobh_write_end(file, mapping, pos, len, copied, page,
> > fsdata);
> > +}
> > +
> > +static int tux3_da_writepage(struct page *page, struct writeback_control
> > *wbc)
> > +{
> > +    return nobh_writepage(page, tux3_get_block_write, wbc);
> > +}
> > +static int tux3_da_writepages(struct address_space *mapping,
> > +                struct writeback_control *wbc)
> > +{
> > +    return mpage_writepages(mapping, wbc, tux3_get_block_write);
> > +}
> > +
> > +const struct address_space_operations tux_da_aops = {
> > +    .readpage        = tux3_readpage,
> > +    .readpages        = tux3_readpages,
> > +    .writepage        = tux3_da_writepage,
> > +    .writepages        = tux3_da_writepages,
> > +    .sync_page        = block_sync_page,
> > +    .write_begin        = tux3_da_write_begin,
> > +    .write_end        = tux3_da_write_end,
> > +    .bmap            = tux3_bmap,
> > +    .direct_IO        = tux3_direct_IO,
> > +    .migratepage        = buffer_migrate_page,
> > +};
> > +
> >  #endif /* __KERNEL__ */
> > diff -pNur tux-orig/inode.c tux-hack/inode.c
> > --- tux-orig/inode.c    2009-01-14 20:00:22.000000000 +0800
> > +++ tux-hack/inode.c    2009-01-15 15:29:43.000000000 +0800
> > @@ -438,7 +438,7 @@ static void tux_setup_inode(struct inode
> >      case S_IFREG:
> >          inode->i_op = &tux_file_iops;
> >          inode->i_fop = &tux_file_fops;
> > -        inode->i_mapping->a_ops = &tux_aops;
> > +        inode->i_mapping->a_ops = &tux_da_aops;
> >          break;
> >      case S_IFDIR:
> >          inode->i_op = &tux_dir_iops;
> > diff -pNur tux-orig/super.c tux-hack/super.c
> > --- tux-orig/super.c    2009-01-14 20:00:22.000000000 +0800
> > +++ tux-hack/super.c    2009-01-15 14:27:53.000000000 +0800
> > @@ -106,6 +106,9 @@ static void tux3_put_super(struct super_
> >      iput(sbi->volmap);
> >      iput(sbi->logmap);
> >
> > +    /* destroy block allocation info */
> > +    tux3_balloc_info_destroy(sbi);
> > +
> >      sb->s_fs_info = NULL;
> >      kfree(sbi);
> >  }
> > @@ -172,6 +175,10 @@ static int tux3_fill_super(struct super_
> >      err = tux_load_sb(sb, silent);
> >      if (err)
> >          goto error;
> > +
> > +    /* initialize block allocation info */
> > +    tux3_balloc_info_init(sbi);
> > +
> >      printk("%s: sb %p, ops %p, depth %Lu, block %Lu, entries_per_leaf
> > %d\n",
> >             __func__,
> >             sbi->itable.sb, sbi->itable.ops,
> > diff -pNur tux-orig/trace.h tux-hack/trace.h
> > --- tux-orig/trace.h    2009-01-14 20:00:22.000000000 +0800
> > +++ tux-hack/trace.h    2009-01-15 15:04:49.000000000 +0800
> > @@ -22,4 +22,15 @@
> >      die(100);                \
> >  } while (0)
> >
> > +
> > +#ifdef __KERNEL__
> > +/* debug macro, xiaofeng */
> > +#define xtrace(f, a...)        { \
> > +                    printk ("(%s, %d): %s:", \
> > +                        __FILE__, __LINE__, __FUNCTION__); \
> > +                                        printk (f, ## a); \
> > +                                            printk ("\n"); \
> > +                            }
> > +
> > +#endif
> >  #endif
> > diff -pNur tux-orig/tux3.h tux-hack/tux3.h
> > --- tux-orig/tux3.h    2009-01-14 20:00:22.000000000 +0800
> > +++ tux-hack/tux3.h    2009-01-15 15:34:37.000000000 +0800
> > @@ -9,6 +9,8 @@
> >  #include <linux/fs.h>
> >  #include <linux/buffer_head.h>
> >  #include <linux/mutex.h>
> > +#include <linux/mm.h>
> > +#include <linux/percpu_counter.h>
> >
> >  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
> >  #include <linux/cred.h> // fsuid
> > @@ -213,6 +215,13 @@ struct cursor {
> >      } path[];
> >  };
> >
> > +/* Tux3 block allocation information */
> > +struct tux3_balloc_info {
> > +    struct percpu_counter freeblocks_counter;
> > +    /* nextalloc_counter, and others */
> > +};
> > +#define freeblocks_counter(sbi)
>  (&sbi->balloc_info.freeblocks_counter)
> > +
> >  /* Tux3-specific sb is a handle for the entire volume state */
> >
> >  struct sb {
> > @@ -241,6 +250,7 @@ struct sb {
> >      struct mutex loglock; /* serialize log entries (spinlock me) */
> >  #ifdef __KERNEL__
> >      struct super_block *vfs_sb; /* Generic kernel superblock */
> > +    struct tux3_balloc_info balloc_info;    /* control info for block
> > allocation */
> >  #else
> >      struct dev *dev;        /* userspace block device */
> >  #endif
> > @@ -620,6 +630,25 @@ static inline struct inode *buffer_inode
> >      return buffer->b_page->mapping->host;
> >  }
> >
> > +static inline void tux3_balloc_info_init(struct sb* sbi)
> > +{
> > +    percpu_counter_init(freeblocks_counter(sbi), sbi->freeblocks);
> > +}
> > +static inline void tux3_balloc_info_destroy(struct sb* sbi)
> > +{
> > +    percpu_counter_destroy(freeblocks_counter(sbi));
> > +}
> > +
> > +static inline void tux3_release_blocks(struct sb* sbi, int count)
> > +{
> > +    percpu_counter_add(freeblocks_counter(sbi), count);
> > +}
> > +
> > +static inline void tux3_reserve_blocks(struct sb* sbi, int count)
> > +{
> > +    percpu_counter_sub(freeblocks_counter(sbi), count);
> > +}
> > +
> >  /* btree.c */
> >  struct buffer_head *cursor_leafbuf(struct cursor *cursor);
> >  void release_cursor(struct cursor *cursor);
> > @@ -678,6 +707,7 @@ int tux3_get_block(struct inode *inode,
> >  extern const struct address_space_operations tux_aops;
> >  extern const struct address_space_operations tux_blk_aops;
> >  extern const struct address_space_operations tux_vol_aops;
> > +extern const struct address_space_operations tux_da_aops;
> >
> >  /* iattr.c */
> >  unsigned encode_asize(unsigned bits);
> >
> >
> > _______________________________________________
> > Tux3 mailing list
> > Tux3 at tux3.org
> > http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3
> >
> >
> --
> Thanks & Best Regards
> Liu Hui
> --
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://phunq.net/pipermail/tux3/attachments/20090115/81faaa5d/attachment-0001.html>
-------------- next part --------------
_______________________________________________
Tux3 mailing list
Tux3 at tux3.org
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3


More information about the Tux3 mailing list