[Tux3] RFC - Delayed allocation for Tux3

Liu XiaoFeng bladehliu at gmail.com
Thu Jan 15 21:45:52 PST 2009


Hi Daniel:

On Fri, Jan 16, 2009 at 8:37 AM, Daniel Phillips <phillips at phunq.net> wrote:

> This is very cool.  Issues to consider:
>
>  * It should support variable block size, which all the rest of Tux3
>    already does.  Let's look at what the tie to page size is, and
>    figure out what to do about it.


This implementation is very simple, using the nobh_ routines to defer block
allocation and a private page flag to reserve space.

Support for variable blocksize is achievable by utilizing buffer-head's
delay flag. Of course, more code is needed.

>
>  * It has to fit with atomic commit.  Details of atomic commit are
>    just now falling into place, so this is an iterative process.


I agree. I am about to get to the atomic commit part :-)

>
>
>  * We may not use mpage_writepages to drive delalloc, because it uses
>    the narrow ->get_block interface to map pages, forcing Tux3 to do
>    a btree probe for every block.  We really need to implement
>    ->writepages directly, using a more direct interface to tux3's
>    map_region that can map a bigger logical address range with a
>    single btree probe.


Yep. We need a tux3_da_writepages() that finds extents of pages and
maps/allocates chunks of contiguous disk blocks.


>  * Planned merge of delalloc would be after we have atomic commit
>    working, so that we can enter the review cycle as early as
>    possible, with as simple a code base as possible.  But if
>    delalloc actually makes atomic commit easier then we will do it
>    now.  This question should be settled over the next few days,
>    I hope you will be involved in the discussion.


> Welcome to the Tux3 hall of fame!  We can really use another developer
> with the level of VFS skill that you obviously have.
>
> Could you please have a look at the new block-oriented page cache
> interfaces?
>
>   http://mailman.tux3.org/pipermail/tux3/2009-January/000657.html
>   "Polymorphic blockread for kernel"
>

Yes. Very glad to.


>
> This may help in thinking about how to do the variable size block
> support.  I will post a full patch pretty soon.
>
> More comments after I try your patch.
>

If you like, please try the revised patch below (only one line changed),
which does block reservation correctly.

Besides the blocksize limit, its free-block statistics might have problems
with truncate.

Please regard this patch as a premature proof of concept. I'll rework it
after reading all the tux3 code.

Many thanks for your comments and encouragement.

Regards, xiaofeng

--

diff -pNur tux-orig/balloc.c tux-hack/balloc.c
--- tux-orig/balloc.c    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/balloc.c    2009-01-15 14:29:26.000000000 +0800
@@ -281,6 +281,7 @@ int bfree(struct sb *sb, block_t start,
     clear_bits(bufdata(buffer), start, blocks);
     brelse_dirty(buffer);
     sb->freeblocks += blocks;
+    tux3_release_blocks(sb, blocks);
     //set_sb_dirty(sb);
     mutex_unlock(&sb->bitmap->i_mutex);
     return 0;
diff -pNur tux-orig/filemap.c tux-hack/filemap.c
--- tux-orig/filemap.c    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/filemap.c    2009-01-16 11:37:20.000000000 +0800
@@ -505,4 +505,117 @@ const struct address_space_operations tu
     .sync_page    = block_sync_page,
     .write_begin    = tux3_vol_write_begin,
 };
+
+
+/*
+ * Tux3's delayed allocation
+ * Note: support blocksize == pagesize only
+ * Written by XiaoFeng LIU <xfengliu at mail.ustc.edu.cn>
+ */
+
+/* proof of concept */
+#define NR_RESERV_BLOCKS    32
+
+static int tux3_da_reserve_blocks(struct super_block *sb, int count)
+{
+    long free_blocks;
+    struct sb *sbi = tux_sb(sb);
+       free_blocks = percpu_counter_read_positive(freeblocks_counter(sbi));
+    xtrace("freeblocks_counter %ld", free_blocks);
+
+    if (free_blocks < count + NR_RESERV_BLOCKS)
+        return -ENOSPC;
+    percpu_counter_sub(freeblocks_counter(sbi), count);
+    return 0;
+}
+
+static void tux3_da_release_blocks(struct super_block *sb, int count)
+{
+    struct sb *sbi = tux_sb(sb);
+    if (count) {
+        percpu_counter_add(freeblocks_counter(sbi), count);
+        sb->s_dirt = 1;
+    }
+}
+
+static int tux3_get_block_delay(struct inode *inode, sector_t iblock,
+                struct buffer_head *bh_rslt, int create)
+{
+    return tux3_get_block(inode, iblock, bh_rslt, 0);
+}
+
+/*
+ * a get_block() called at the writeout time.
+ */
+static int tux3_get_block_write(struct inode *inode, sector_t iblock,
+                struct buffer_head *bh_rslt, int create)
+{
+    pgoff_t index = (pgoff_t) (iblock >> (PAGE_CACHE_SHIFT -
inode->i_blkbits));
+    struct page *page = find_get_page(inode->i_mapping, index);
+
+    /* the page should be here, and dirty */
+    if (unlikely(!page)) {
+        xtrace("find_get_page ret NULL.");
+        goto out;
+    }
+    if (create && PageChecked(page)) {
+             ClearPageChecked(page);
+        tux3_da_release_blocks(inode->i_sb, 1);
+    }
+    if (page)
+        page_cache_release(page);
+
+out:
+    return tux3_get_block(inode, iblock, bh_rslt, create);
+}
+
+static int tux3_da_write_begin(struct file *file, struct address_space
*mapping,
+                loff_t pos, unsigned len, unsigned flags,
+                struct page **pagep, void **fsdata)
+{
+    return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+                tux3_get_block_delay);
+}
+
+static int tux3_da_write_end(struct file *file, struct address_space
*mapping,
+                loff_t    pos, unsigned len, unsigned copied,
+                struct page *page, void *fsdata)
+{
+    /*
+     * Never do block reservation if the block has been allocated.
+     * In that case, tux3_da_write_begin sets the page mapped-to-disk.
+     */
+    if (!PageMappedToDisk(page) && !PageChecked(page)) {
+        int ret = tux3_da_reserve_blocks(mapping->host->i_sb, 1);
+        if (ret)
+            return ret;
+        SetPageChecked(page);
+    }
+
+    return nobh_write_end(file, mapping, pos, len, copied, page, fsdata);
+}
+
+static int tux3_da_writepage(struct page *page, struct writeback_control
*wbc)
+{
+    return nobh_writepage(page, tux3_get_block_write, wbc);
+}
+static int tux3_da_writepages(struct address_space *mapping,
+                struct writeback_control *wbc)
+{
+    return mpage_writepages(mapping, wbc, tux3_get_block_write);
+}
+
+const struct address_space_operations tux_da_aops = {
+    .readpage        = tux3_readpage,
+    .readpages        = tux3_readpages,
+    .writepage        = tux3_da_writepage,
+    .writepages        = tux3_da_writepages,
+    .sync_page        = block_sync_page,
+    .write_begin        = tux3_da_write_begin,
+    .write_end        = tux3_da_write_end,
+    .bmap            = tux3_bmap,
+    .direct_IO        = tux3_direct_IO,
+    .migratepage        = buffer_migrate_page,
+};
+
 #endif /* __KERNEL__ */
diff -pNur tux-orig/inode.c tux-hack/inode.c
--- tux-orig/inode.c    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/inode.c    2009-01-15 15:29:43.000000000 +0800
@@ -438,7 +438,7 @@ static void tux_setup_inode(struct inode
     case S_IFREG:
         inode->i_op = &tux_file_iops;
         inode->i_fop = &tux_file_fops;
-        inode->i_mapping->a_ops = &tux_aops;
+        inode->i_mapping->a_ops = &tux_da_aops;
         break;
     case S_IFDIR:
         inode->i_op = &tux_dir_iops;
diff -pNur tux-orig/modules.order tux-hack/modules.order
--- tux-orig/modules.order    1970-01-01 08:00:00.000000000 +0800
+++ tux-hack/modules.order    2009-01-16 10:08:26.000000000 +0800
@@ -0,0 +1 @@
+kernel//home/xiaofeng/tux3bed/tux-hack/tux3.ko
diff -pNur tux-orig/super.c tux-hack/super.c
--- tux-orig/super.c    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/super.c    2009-01-15 14:27:53.000000000 +0800
@@ -106,6 +106,9 @@ static void tux3_put_super(struct super_
     iput(sbi->volmap);
     iput(sbi->logmap);

+    /* destroy block allocation info */
+    tux3_balloc_info_destroy(sbi);
+
     sb->s_fs_info = NULL;
     kfree(sbi);
 }
@@ -172,6 +175,10 @@ static int tux3_fill_super(struct super_
     err = tux_load_sb(sb, silent);
     if (err)
         goto error;
+
+    /* initialize block allocation info */
+    tux3_balloc_info_init(sbi);
+
     printk("%s: sb %p, ops %p, depth %Lu, block %Lu, entries_per_leaf
%d\n",
            __func__,
            sbi->itable.sb, sbi->itable.ops,
diff -pNur tux-orig/trace.h tux-hack/trace.h
--- tux-orig/trace.h    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/trace.h    2009-01-15 15:04:49.000000000 +0800
@@ -22,4 +22,15 @@
     die(100);                \
 } while (0)

+
+#ifdef __KERNEL__
+/* debug macro, xiaofeng */
+#define xtrace(f, a...)        { \
+                    printk ("(%s, %d): %s:", \
+                        __FILE__, __LINE__, __FUNCTION__); \
+                                        printk (f, ## a); \
+                                            printk ("\n"); \
+                            }
+
+#endif
 #endif
diff -pNur tux-orig/tux3.h tux-hack/tux3.h
--- tux-orig/tux3.h    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/tux3.h    2009-01-15 15:34:37.000000000 +0800
@@ -9,6 +9,8 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/percpu_counter.h>

 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
 #include <linux/cred.h> // fsuid
@@ -213,6 +215,13 @@ struct cursor {
     } path[];
 };

+/* Tux3 block allocation information */
+struct tux3_balloc_info {
+    struct percpu_counter freeblocks_counter;
+    /* nextalloc_counter, and others */
+};
+#define freeblocks_counter(sbi)    (&sbi->balloc_info.freeblocks_counter)
+
 /* Tux3-specific sb is a handle for the entire volume state */

 struct sb {
@@ -241,6 +250,7 @@ struct sb {
     struct mutex loglock; /* serialize log entries (spinlock me) */
 #ifdef __KERNEL__
     struct super_block *vfs_sb; /* Generic kernel superblock */
+    struct tux3_balloc_info balloc_info;    /* control info for block
allocation */
 #else
     struct dev *dev;        /* userspace block device */
 #endif
@@ -620,6 +630,25 @@ static inline struct inode *buffer_inode
     return buffer->b_page->mapping->host;
 }

+static inline void tux3_balloc_info_init(struct sb* sbi)
+{
+    percpu_counter_init(freeblocks_counter(sbi), sbi->freeblocks);
+}
+static inline void tux3_balloc_info_destroy(struct sb* sbi)
+{
+    percpu_counter_destroy(freeblocks_counter(sbi));
+}
+
+static inline void tux3_release_blocks(struct sb* sbi, int count)
+{
+    percpu_counter_add(freeblocks_counter(sbi), count);
+}
+
+static inline void tux3_reserve_blocks(struct sb* sbi, int count)
+{
+    percpu_counter_sub(freeblocks_counter(sbi), count);
+}
+
 /* btree.c */
 struct buffer_head *cursor_leafbuf(struct cursor *cursor);
 void release_cursor(struct cursor *cursor);
@@ -678,6 +707,7 @@ int tux3_get_block(struct inode *inode,
 extern const struct address_space_operations tux_aops;
 extern const struct address_space_operations tux_blk_aops;
 extern const struct address_space_operations tux_vol_aops;
+extern const struct address_space_operations tux_da_aops;

 /* iattr.c */
 unsigned encode_asize(unsigned bits);
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://phunq.net/pipermail/tux3/attachments/20090116/b95eb0da/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: tux3-da-2.patch
Type: text/x-patch
Size: 8050 bytes
Desc: not available
URL: <http://phunq.net/pipermail/tux3/attachments/20090116/b95eb0da/attachment-0001.bin>
-------------- next part --------------
_______________________________________________
Tux3 mailing list
Tux3 at tux3.org
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3


More information about the Tux3 mailing list