[Tux3] Deferred delete prototype for Ext2

Daniel Phillips phillips at phunq.net
Tue Dec 2 20:16:13 PST 2008


I have talked about the benefits of deferring namespace operations such
as file create and unlink.  For Tux3, the main motivation is to remove
the need to share write access to inode table blocks between front end
filesystem operations and back end cache flushing.  There are other
good reasons for doing it too:

  * Take long-running filesystem operations outside the i_mutex
    directory lock, which is heavily contented in some situations

  * Maybe there are opportunities to batch up edits to directory
    blocks and perform them more efficiently

  * Can sometimes handle a create-delete sequence entirely in cache

Anyway, here is a very crude patch that just defers unlinks of regular
files under Ext2.  It relies on receiving an "fsync" to the directory to
actually remove the name from the directory entry block.  I wrote a
little fsync utility for doing that:

#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

int main(int argc, char *argv[])
{
        if ((fsync(open(argv[1], O_RDONLY))) == -1) {
                printf("failed to fsync %s: %s (%i)\n", argv[1], strerror(errno), errno);
        }
        return 0;
}

Hirofumi showed how it can be done with dd too, a nice trick.  Anyway,
after a rm, the fsync has to be done by hand or the on-disk unlink will
never happen.  But if you ls the directory, it will seem to be gone and
it cannot be opened.  To be precise, the dentry for the file has been
turned into a "negative dentry" that says "this file does not exist",
and the negative dentry is locked into cache so that the filesystem
unlink can be done later.  Of course, Ext3 will keep track of
directories that have deferred unlinked dentries, and automatically do
the fsync.

Besides the dentry, the cached inode has to be freed if its link count
has reached zero.  We want to defer the inode free also, it is the
inode delete that causes the Tux3 layering problem described above.
And inode that needs to be deleted on disk is put onto a new list I
created on in the ext2-specific superblock.  I use the inode dirty list
link for this, which appears to be safe, and consistent with other vfs
usage.  At sync time (sys_sync) each inode on the superblock delete
list is deleted by a stripped down ext2 unlink function.

I only had to make one hack to core vfs to make this work.  Essentially,
we are changing the dentry cache from a volatile, read only cache to a
writeback cache which is entirely consistent on its own, as opposed to
just being an accelerator.  We always knew this was possible, because
that is how RAMFS works.  But I needed to make it work when the cache
is backed by a filesystem.  All I had to do here is add one new method,
to prevent the dentry cache from forcibly removing a dentry from the
cache that has been unlinked, but which is still in used for some hard
to understand reason.  With my new hook (d_ops->hide) the dentry just
stays in cache as a negative dentry and we take a reference count on it
to make sure it does not disappear before our code has a chance to find
it and remove the underlying directory entry.

We need to be able to tell the difference between a negative dentry
that still exists on the filesystem, and one that does not.  I added a
new dentry state flag, DCACHE_BACKED, which is set each time a dentry
is created due to a "real lookup" from the filesystem.  Later, when
the patch supports deferred create as well, it will be possible to have
dentries that are not backed by the filesystem (filesystem level create
is still deferred) and can be deleted without touching the filesystem.

Enjoy,

Daniel

p.s., this is hard :)

diff --git a/Makefile b/Makefile
index 56fb747..985a7ce 100644
--- a/Makefile
+++ b/Makefile
@@ -538,7 +538,7 @@ NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
 CHECKFLAGS     += $(NOSTDINC_FLAGS)
 
 # warn about C99 declaration after statement
-KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,)
+#KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,)
 
 # disable pointer signed / unsigned warnings in gcc 4.0
 KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
diff --git a/fs/dcache.c b/fs/dcache.c
index 6068c25..cd6a17e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1394,7 +1394,8 @@ void d_delete(struct dentry * dentry)
 	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
-	if (atomic_read(&dentry->d_count) == 1) {
+	if ((dentry->d_op && dentry->d_op->d_hide && dentry->d_op->d_hide(dentry))
+	    || atomic_read(&dentry->d_count) == 1) {
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index a78c6b4..9888578 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -699,6 +699,32 @@ not_empty:
 	return 0;
 }
 
+int ext2_unlink_deferred(struct inode *dir, struct dentry *dentry);
+
+int ext2_sync_dir(struct file *file, struct dentry *dir, int datasync)
+{
+	printk(">>> ext2_sync_dir %p \"%.*s\"\n", dir, dir->d_name.len, dir->d_name.name);
+	struct list_head *next;
+	spin_lock(&dcache_lock);
+	for (next = dir->d_subdirs.next; next != &dir->d_subdirs;) {
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
+		if (d_unhashed(dentry))
+			continue;
+		show_dentry("dentry", dentry);
+		next = next->next;
+		if (d_negative(dentry) && (dentry->d_flags & DCACHE_BACKED)) {
+			show_dentry("deferred delete", dentry);
+			dentry->d_flags &= ~DCACHE_BACKED;
+			spin_unlock(&dcache_lock);
+			ext2_unlink_deferred(dir->d_inode, dentry);
+			dput(dentry);
+			spin_lock(&dcache_lock);
+		}
+	}
+	spin_unlock(&dcache_lock);
+	return ext2_sync_file(file, dir, datasync);
+}
+
 const struct file_operations ext2_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
@@ -707,5 +733,5 @@ const struct file_operations ext2_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.fsync		= ext2_sync_file,
+	.fsync		= ext2_sync_dir,
 };
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 384fc0d..5e8af42 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -56,8 +56,9 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext2_delete_inode (struct inode * inode)
+void ext2_delete_inode(struct inode *inode)
 {
+	printk(">>> ext2_delete_inode\n");
 	truncate_inode_pages(&inode->i_data, 0);
 
 	if (is_bad_inode(inode))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 80c97fd..d33b14c 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -36,10 +36,22 @@
 #include "acl.h"
 #include "xip.h"
 
+static int ext2_hide_dentry(struct dentry *dentry)
+{
+	show_dentry("hide dentry", dentry);
+	dget(dentry);
+	return 1;
+}
+
+static struct dentry_operations ext2_dentry_operations = {
+	.d_hide = ext2_hide_dentry,
+};
+
 static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ext2_add_link(dentry, inode);
 	if (!err) {
+		dentry->d_op = &ext2_dentry_operations;
 		d_instantiate(dentry, inode);
 		return 0;
 	}
@@ -67,6 +79,8 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
 	}
+	dentry->d_flags |= DCACHE_BACKED;
+	dentry->d_op = &ext2_dentry_operations;
 	return d_splice_alias(inode, dentry);
 }
 
@@ -237,6 +251,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 	if (err)
 		goto out_fail;
 
+	dentry->d_op = &ext2_dentry_operations;
 	d_instantiate(dentry, inode);
 out:
 	return err;
@@ -250,6 +265,21 @@ out_dir:
 	goto out;
 }
 
+static int defer_unlink(struct inode *dir, struct dentry *dentry)
+{
+	show_dentry("defer unlink", dentry);
+	dentry->d_inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(dentry->d_inode);
+	return 0;
+}
+
+int ext2_unlink_deferred(struct inode *dir, struct dentry *dentry)
+{
+	struct page *page;
+	struct ext2_dir_entry_2 *de = ext2_find_entry(dir, dentry, &page);
+	return de ? ext2_delete_entry(de, page) : -ENOENT;
+}
+
 static int ext2_unlink(struct inode * dir, struct dentry *dentry)
 {
 	struct inode * inode = dentry->d_inode;
@@ -377,7 +407,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 	.create		= ext2_create,
 	.lookup		= ext2_lookup,
 	.link		= ext2_link,
-	.unlink		= ext2_unlink,
+	.unlink		= defer_unlink,
 	.symlink	= ext2_symlink,
 	.mkdir		= ext2_mkdir,
 	.rmdir		= ext2_rmdir,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ef50cbc..ec23c7a 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -295,17 +295,52 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, siz
 static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
 #endif
 
+extern spinlock_t inode_lock;
+
+static inline void show_inode(char *tag, struct inode *inode)
+{
+	printk(">>> %s: %p/%i %x\n", tag, inode,
+		atomic_read(&inode->i_count), inode->i_flags);
+}
+
+static void defer_drop_inode(struct inode *inode)
+{
+	if (inode->i_nlink)
+		generic_drop_inode(inode);
+	else {
+		show_inode("defer delete", inode);
+		inode->i_state |= I_DIRTY;
+		list_move(&inode->i_list, &EXT2_SB(inode->i_sb)->delete);
+		spin_unlock(&inode_lock);
+	}
+}
+
+extern struct list_head inode_unused;
+
+static int ext2_sync_fs(struct super_block *sb, int wait)
+{
+	while (!list_empty(&EXT2_SB(sb)->delete)) {
+		struct inode *inode = list_entry(EXT2_SB(sb)->delete.next, struct inode, i_list);
+		show_inode(">>> delete deferred", inode);
+		spin_lock(&inode_lock);
+		generic_delete_inode(inode); /* removes from list and drops lock */
+	}
+	return 0;
+}
+
 static const struct super_operations ext2_sops = {
 	.alloc_inode	= ext2_alloc_inode,
 	.destroy_inode	= ext2_destroy_inode,
 	.write_inode	= ext2_write_inode,
 	.delete_inode	= ext2_delete_inode,
+	.drop_inode	= defer_drop_inode,
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
 	.clear_inode	= ext2_clear_inode,
 	.show_options	= ext2_show_options,
+	.sync_fs	= ext2_sync_fs,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext2_quota_read,
 	.quota_write	= ext2_quota_write,
@@ -614,6 +649,7 @@ static int ext2_setup_super (struct super_block * sb,
 			EXT2_BLOCKS_PER_GROUP(sb),
 			EXT2_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
+	INIT_LIST_HEAD(&sbi->delete);
 	return res;
 }
 
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d982eb8..ee77f72 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -100,6 +100,7 @@ struct dentry {
 		struct list_head d_child;	/* child of parent list */
 	 	struct rcu_head d_rcu;
 	} d_u;
+	struct list_head d_defer;	/* deferred namespace operations */
 	struct list_head d_subdirs;	/* our children */
 	struct list_head d_alias;	/* inode alias list */
 	unsigned long d_time;		/* used by d_revalidate */
@@ -133,6 +134,7 @@ struct dentry_operations {
 	void (*d_release)(struct dentry *);
 	void (*d_iput)(struct dentry *, struct inode *);
 	char *(*d_dname)(struct dentry *, char *, int);
+	int (*d_hide)(struct dentry *);
 };
 
 /* the dentry parameter passed to d_hash and d_compare is the parent
@@ -175,6 +177,8 @@ d_iput:		no		no		no       yes
 #define DCACHE_UNHASHED		0x0010	
 
 #define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
+#define DCACHE_BACKED		0x0020 /* FS has dirent */
+#define DCACHE_WRONG		0x0040 /* FS has wrong dirent */
 
 extern spinlock_t dcache_lock;
 extern seqlock_t rename_lock;
@@ -341,6 +345,11 @@ static inline int d_unhashed(struct dentry *dentry)
 	return (dentry->d_flags & DCACHE_UNHASHED);
 }
 
+static inline int d_negative(struct dentry *dentry)
+{
+	return !dentry->d_inode;
+}
+
 static inline struct dentry *dget_parent(struct dentry *dentry)
 {
 	struct dentry *ret;
@@ -363,4 +372,11 @@ extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
 
 extern int sysctl_vfs_cache_pressure;
 
+static inline void show_dentry(char *tag, struct dentry *dentry)
+{
+	printk(">>> %s: %p/%i %x \"%.*s\"\n", tag, dentry,
+		atomic_read(&dentry->d_count), dentry->d_flags,
+		dentry->d_name.len, dentry->d_name.name);
+}
+
 #endif	/* __LINUX_DCACHE_H */
diff --git a/include/linux/ext2_fs_sb.h b/include/linux/ext2_fs_sb.h
index f273415..54ef185 100644
--- a/include/linux/ext2_fs_sb.h
+++ b/include/linux/ext2_fs_sb.h
@@ -106,6 +106,7 @@ struct ext2_sb_info {
 	spinlock_t s_rsv_window_lock;
 	struct rb_root s_rsv_window_root;
 	struct ext2_reserve_window_node s_rsv_window_head;
+	struct list_head delete;
 };
 
 #endif	/* _LINUX_EXT2_FS_SB */

_______________________________________________
Tux3 mailing list
Tux3 at tux3.org
http://tux3.org/cgi-bin/mailman/listinfo/tux3



More information about the Tux3 mailing list