[Tux3] RFC - Delayed allocation for Tux3
Liu XiaoFeng
bladehliu at gmail.com
Thu Jan 15 02:24:52 PST 2009
Here is a simple implementation of delayed allocation for Tux3.
Delayed allocation defers block allocation from prepare-write (write-begin)
time to page writeback time. It is a powerful technique, implemented by
several filesystems such as XFS, ext4, and btrfs.
Unlike ext4's delalloc, this implementation is independent of the extent tree
structure.
Signed-off by XiaoFeng Liu.
---
balloc.c | 1
filemap.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
inode.c | 2 -
super.c | 7 ++++
trace.h | 11 ++++++
tux3.h | 30 +++++++++++++++++
6 files changed, 158 insertions(+), 1 deletion(-)
diff -pNur tux-orig/balloc.c tux-hack/balloc.c
--- tux-orig/balloc.c 2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/balloc.c 2009-01-15 14:29:26.000000000 +0800
@@ -281,6 +281,7 @@ int bfree(struct sb *sb, block_t start,
clear_bits(bufdata(buffer), start, blocks);
brelse_dirty(buffer);
sb->freeblocks += blocks;
+ tux3_release_blocks(sb, blocks);
//set_sb_dirty(sb);
mutex_unlock(&sb->bitmap->i_mutex);
return 0;
diff -pNur tux-orig/filemap.c tux-hack/filemap.c
--- tux-orig/filemap.c 2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/filemap.c 2009-01-15 14:25:07.000000000 +0800
@@ -505,4 +505,112 @@ const struct address_space_operations tu
.sync_page = block_sync_page,
.write_begin = tux3_vol_write_begin,
};
+
+
+/*
+ * Tux3's delayed allocation
+ * Note: supports blocksize == pagesize only
+ * Written by XiaoFeng LIU <xfengliu at mail.ustc.edu.cn>
+ */
+
+/*
+ * Margin of free blocks that delayed-allocation reservations must leave
+ * untouched (proof of concept value).
+ */
+#define NR_RESERV_BLOCKS 32
+
+/*
+ * Reserve @count blocks for delayed allocation by subtracting them from the
+ * free-blocks counter.
+ *
+ * Returns 0 on success, or -ENOSPC when the current counter reading is
+ * smaller than @count plus the NR_RESERV_BLOCKS margin.
+ */
+static int tux3_da_reserve_blocks(struct super_block *sb, int count)
+{
+	struct sb *sbi = tux_sb(sb);
+	long avail = percpu_counter_read_positive(freeblocks_counter(sbi));
+
+	xtrace("freeblocks_counter %ld", avail);
+
+	if (avail >= count + NR_RESERV_BLOCKS) {
+		percpu_counter_sub(freeblocks_counter(sbi), count);
+		return 0;
+	}
+	return -ENOSPC;
+}
+
+/*
+ * Hand @count reserved blocks back to the free-blocks counter and flag the
+ * VFS superblock dirty.  A zero @count changes nothing.
+ */
+static void tux3_da_release_blocks(struct super_block *sb, int count)
+{
+	struct sb *sbi = tux_sb(sb);
+
+	if (!count)
+		return;
+
+	percpu_counter_add(freeblocks_counter(sbi), count);
+	sb->s_dirt = 1;
+}
+
+/*
+ * get_block() handed to nobh_write_begin(): forwards to tux3_get_block()
+ * with the create argument forced to 0, ignoring the caller's @create
+ * (presumably so that no block is allocated at write_begin time).
+ */
+static int tux3_get_block_delay(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_rslt, int create)
+{
+	const int lookup_only = 0;
+
+	return tux3_get_block(inode, iblock, bh_rslt, lookup_only);
+}
+
+/*
+ * A get_block() called at writeout time.
+ *
+ * Looks up the page cache page covering @iblock.  When it is found, @create
+ * is set and the page carries the Checked flag, the flag is cleared and one
+ * block is handed back through tux3_da_release_blocks().  In every case the
+ * request is then forwarded unchanged to tux3_get_block().
+ */
+static int tux3_get_block_write(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_rslt, int create)
+{
+	unsigned shift = PAGE_CACHE_SHIFT - inode->i_blkbits;
+	pgoff_t pgidx = (pgoff_t)(iblock >> shift);
+	struct page *pg = find_get_page(inode->i_mapping, pgidx);
+
+	if (unlikely(!pg)) {
+		/* the page should be here, and dirty */
+		xtrace("find_get_page ret NULL.");
+	} else {
+		if (create && PageChecked(pg)) {
+			ClearPageChecked(pg);
+			tux3_da_release_blocks(inode->i_sb, 1);
+		}
+		/* drop the reference taken by find_get_page() */
+		page_cache_release(pg);
+	}
+
+	return tux3_get_block(inode, iblock, bh_rslt, create);
+}
+
+/*
+ * write_begin for delayed allocation: delegates to nobh_write_begin() with
+ * tux3_get_block_delay() as the block-mapping callback.
+ */
+static int tux3_da_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	int err;
+
+	err = nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+			       tux3_get_block_delay);
+	return err;
+}
+
+/*
+ * write_end for delayed allocation.
+ *
+ * A page that does not yet carry the Checked flag gets one block reserved
+ * through tux3_da_reserve_blocks() and is then flagged Checked, so each page
+ * is charged once.  The write is completed by nobh_write_end().
+ *
+ * Returns the result of nobh_write_end(), or the error from
+ * tux3_da_reserve_blocks() when the reservation fails.
+ */
+static int tux3_da_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct page *page, void *fsdata)
+{
+	if (!PageChecked(page)) {
+		int ret = tux3_da_reserve_blocks(mapping->host->i_sb, 1);
+
+		if (ret) {
+			struct buffer_head *head = fsdata;
+
+			/*
+			 * write_end owns the page handed out by write_begin:
+			 * it has to be unlocked and released even on failure,
+			 * otherwise it stays locked and pinned.
+			 */
+			unlock_page(page);
+			page_cache_release(page);
+
+			/*
+			 * NOTE(review): mirrors the cleanup nobh_write_end()
+			 * does for the buffer heads that nobh_write_begin()
+			 * passes along in fsdata - confirm against the
+			 * target kernel's fs/buffer.c.
+			 */
+			while (head) {
+				struct buffer_head *bh = head;
+
+				head = head->b_this_page;
+				free_buffer_head(bh);
+			}
+			return ret;
+		}
+		SetPageChecked(page);
+	}
+	return nobh_write_end(file, mapping, pos, len, copied, page, fsdata);
+}
+
+/*
+ * writepage for delayed allocation: delegates to nobh_writepage() with
+ * tux3_get_block_write() as the block-mapping callback.
+ */
+static int tux3_da_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int err = nobh_writepage(page, tux3_get_block_write, wbc);
+
+	return err;
+}
+/*
+ * writepages for delayed allocation: delegates to mpage_writepages() with
+ * tux3_get_block_write() as the block-mapping callback.
+ */
+static int tux3_da_writepages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	int err = mpage_writepages(mapping, wbc, tux3_get_block_write);
+
+	return err;
+}
+
+/* Address space operations for regular files using delayed allocation */
+const struct address_space_operations tux_da_aops = {
+	/* read side */
+	.readpage	= tux3_readpage,
+	.readpages	= tux3_readpages,
+	.sync_page	= block_sync_page,
+	/* buffered write side: delayed-allocation entry points */
+	.write_begin	= tux3_da_write_begin,
+	.write_end	= tux3_da_write_end,
+	/* writeback side */
+	.writepage	= tux3_da_writepage,
+	.writepages	= tux3_da_writepages,
+	/* everything else */
+	.bmap		= tux3_bmap,
+	.direct_IO	= tux3_direct_IO,
+	.migratepage	= buffer_migrate_page,
+};
+
#endif /* __KERNEL__ */
diff -pNur tux-orig/inode.c tux-hack/inode.c
--- tux-orig/inode.c 2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/inode.c 2009-01-15 15:29:43.000000000 +0800
@@ -438,7 +438,7 @@ static void tux_setup_inode(struct inode
case S_IFREG:
inode->i_op = &tux_file_iops;
inode->i_fop = &tux_file_fops;
- inode->i_mapping->a_ops = &tux_aops;
+ inode->i_mapping->a_ops = &tux_da_aops;
break;
case S_IFDIR:
inode->i_op = &tux_dir_iops;
diff -pNur tux-orig/super.c tux-hack/super.c
--- tux-orig/super.c 2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/super.c 2009-01-15 14:27:53.000000000 +0800
@@ -106,6 +106,9 @@ static void tux3_put_super(struct super_
iput(sbi->volmap);
iput(sbi->logmap);
+ /* destroy block allocation info */
+ tux3_balloc_info_destroy(sbi);
+
sb->s_fs_info = NULL;
kfree(sbi);
}
@@ -172,6 +175,10 @@ static int tux3_fill_super(struct super_
err = tux_load_sb(sb, silent);
if (err)
goto error;
+
+ /* initialize block allocation info */
+ tux3_balloc_info_init(sbi);
+
printk("%s: sb %p, ops %p, depth %Lu, block %Lu, entries_per_leaf
%d\n",
__func__,
sbi->itable.sb, sbi->itable.ops,
diff -pNur tux-orig/trace.h tux-hack/trace.h
--- tux-orig/trace.h 2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/trace.h 2009-01-15 15:04:49.000000000 +0800
@@ -22,4 +22,15 @@
die(100); \
} while (0)
+
+#ifdef __KERNEL__
+/* debug macro, xiaofeng */
+#define xtrace(f, a...) { \
+ printk ("(%s, %d): %s:", \
+ __FILE__, __LINE__, __FUNCTION__); \
+ printk (f, ## a); \
+ printk ("\n"); \
+ }
+
+#endif
#endif
diff -pNur tux-orig/tux3.h tux-hack/tux3.h
--- tux-orig/tux3.h 2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/tux3.h 2009-01-15 15:34:37.000000000 +0800
@@ -9,6 +9,8 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/percpu_counter.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
#include <linux/cred.h> // fsuid
@@ -213,6 +215,13 @@ struct cursor {
} path[];
};
+/* Tux3 block allocation information */
+struct tux3_balloc_info {
+	struct percpu_counter freeblocks_counter;
+	/* nextalloc_counter, and others */
+};
+
+/*
+ * Address of the free-blocks percpu counter inside a struct sb.  The
+ * argument is parenthesized so any pointer expression can be passed.
+ */
+#define freeblocks_counter(sbi) (&(sbi)->balloc_info.freeblocks_counter)
+
/* Tux3-specific sb is a handle for the entire volume state */
struct sb {
@@ -241,6 +250,7 @@ struct sb {
struct mutex loglock; /* serialize log entries (spinlock me) */
#ifdef __KERNEL__
struct super_block *vfs_sb; /* Generic kernel superblock */
+ struct tux3_balloc_info balloc_info; /* control info for block
allocation */
#else
struct dev *dev; /* userspace block device */
#endif
@@ -620,6 +630,25 @@ static inline struct inode *buffer_inode
return buffer->b_page->mapping->host;
}
+/*
+ * Initialize the block allocation info of @sbi: the free-blocks percpu
+ * counter starts from sbi->freeblocks.
+ *
+ * Returns the result of percpu_counter_init() so a caller can detect a
+ * failed initialization; callers that ignore the value keep working.
+ */
+static inline int tux3_balloc_info_init(struct sb *sbi)
+{
+	return percpu_counter_init(freeblocks_counter(sbi), sbi->freeblocks);
+}
+/* Tear down the free-blocks percpu counter of @sbi. */
+static inline void tux3_balloc_info_destroy(struct sb *sbi)
+{
+	struct percpu_counter *counter = freeblocks_counter(sbi);
+
+	percpu_counter_destroy(counter);
+}
+
+/* Add @count blocks to the free-blocks percpu counter of @sbi. */
+static inline void tux3_release_blocks(struct sb *sbi, int count)
+{
+	struct percpu_counter *counter = freeblocks_counter(sbi);
+
+	percpu_counter_add(counter, count);
+}
+
+/* Subtract @count blocks from the free-blocks percpu counter of @sbi. */
+static inline void tux3_reserve_blocks(struct sb *sbi, int count)
+{
+	struct percpu_counter *counter = freeblocks_counter(sbi);
+
+	percpu_counter_sub(counter, count);
+}
+
/* btree.c */
struct buffer_head *cursor_leafbuf(struct cursor *cursor);
void release_cursor(struct cursor *cursor);
@@ -678,6 +707,7 @@ int tux3_get_block(struct inode *inode,
extern const struct address_space_operations tux_aops;
extern const struct address_space_operations tux_blk_aops;
extern const struct address_space_operations tux_vol_aops;
+extern const struct address_space_operations tux_da_aops;
/* iattr.c */
unsigned encode_asize(unsigned bits);
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://phunq.net/pipermail/tux3/attachments/20090115/5b20d959/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: tux3-da.patch
Type: text/x-patch
Size: 7636 bytes
Desc: not available
URL: <http://phunq.net/pipermail/tux3/attachments/20090115/5b20d959/attachment.bin>
-------------- next part --------------
_______________________________________________
Tux3 mailing list
Tux3 at tux3.org
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3
More information about the Tux3
mailing list