[Tux3] Rough draft of atomic commit for userspace

Daniel Phillips phillips at phunq.net
Sun Feb 1 22:36:27 PST 2009


It compiles, it has not been tried.  There are probably mistakes and
omissions.

There are two new lists in the superblock:

  sb->pinned ... dirty btree nodes and bitmap blocks written per rollup
  sb->commit ... dirty blocks written per delta

This code implements a periodic rollup flush, which mainly moves blocks
from the pinned list to the commit list, and a delta flush, which
writes out the dirty metadata, then writes out the log blocks.

diff -r 4d5b59994ae7 user/buffer.c
--- a/user/buffer.c	Wed Jan 28 21:19:05 2009 -0800
+++ b/user/buffer.c	Sun Feb 01 22:33:25 2009 -0800
@@ -299,7 +299,7 @@ struct buffer_head *blockread(map_t *map
 	return buffer;
 }
 
-int blockdirty(struct buffer_head *buffer, unsigned newdelta)
+int blockdirty(struct buffer_head *buffer, unsigned newdelta, struct list_head *forked)
 {
 	unsigned oldstate = buffer->state;
 	assert(oldstate < BUFFER_STATES);
@@ -316,7 +316,7 @@ int blockdirty(struct buffer_head *buffe
 		buffer->data = clone->data;
 		clone->data = data;
 		clone->index = buffer->index;
-		set_buffer_state(clone, oldstate);
+		set_buffer_state_list(clone, oldstate, forked);
 		brelse(clone);
 	}
 	set_buffer_state_list(buffer, BUFFER_DIRTY + newdelta, &buffer->map->dirty);
diff -r 4d5b59994ae7 user/buffer.h
--- a/user/buffer.h	Wed Jan 28 21:19:05 2009 -0800
+++ b/user/buffer.h	Sun Feb 01 22:33:25 2009 -0800
@@ -57,7 +57,7 @@ struct buffer_head *peekblk(map_t *map, 
 struct buffer_head *peekblk(map_t *map, block_t block);
 struct buffer_head *blockget(map_t *map, block_t block);
 struct buffer_head *blockread(map_t *map, block_t block);
-int blockdirty(struct buffer_head *buffer, unsigned newdelta);
+int blockdirty(struct buffer_head *buffer, unsigned newdelta, struct list_head *forked);
 int flush_buffers(map_t *map);
 int flush_state(unsigned state);
 void evict_buffers(map_t *map);
diff -r 4d5b59994ae7 user/commit.c
--- a/user/commit.c	Wed Jan 28 21:19:05 2009 -0800
+++ b/user/commit.c	Sun Feb 01 22:33:25 2009 -0800
@@ -58,10 +58,18 @@ void replay(struct sb *sb)
 	}
 }
 
+/* Delta commit and log flush */
+
 static int need_delta(struct sb *sb)
 {
 	static unsigned crudehack;
 	return !(++crudehack % 10);
+}
+
+static int need_flush(struct sb *sb)
+{
+	static unsigned crudehack;
+	return !(++crudehack % 3);
 }
 
 int write_bitmap(struct buffer_head *buffer)
@@ -75,14 +83,33 @@ int write_bitmap(struct buffer_head *buf
 	if (buffer->state - BUFFER_DIRTY == (sb->delta & (BUFFER_DIRTY_STATES - 1)))
 		return -EAGAIN;
 	trace("write bitmap %Lx", (L)buffer->index);
-	if (!(err = diskwrite(sb->dev->fd, buffer->data, sb->blocksize, seg.block)))
+	if (!(err = diskwrite(sb->dev->fd, buffer->data, sb->blocksize, seg.block << sb->blockbits)))
 		set_buffer_clean(buffer);
 	return 0;
 }
 
-static int stage_delta(struct sb *sb)
+int move_deferred(struct sb *sb, u64 val)
 {
-	assert(sb->dev->bits >= 8 && sb->dev->fd);
+	return stash_value(&sb->defree, val);
+}
+
+static int flush_log(struct sb *sb)
+{
+	/*
+	 * Flush a snapshot of the allocation map to disk.  Physical blocks for
+	 * the bitmaps and new or redirected bitmap btree nodes may be allocated
+	 * during the flush.  Any bitmap blocks that are (re)dirtied by these
+	 * allocations will be written out in the next flush cycle.  A redirtied
+	 * bitmap block is replaced in cache by a clone, which is then modified.
+	 * The original goes onto a list of forked blocks to be written out
+	 * separately.
+	 */
+
+	/* further block allocations belong to the next cycle */
+	sb->flush++;
+
+	/* map dirty bitmap blocks to disk and write out */
+
 	struct buffer_head *buffer, *safe;
 	struct list_head *head = &mapping(sb->bitmap)->dirty;
 	list_for_each_entry_safe(buffer, safe, head, link) {
@@ -90,6 +117,63 @@ static int stage_delta(struct sb *sb)
 		if (err != -EAGAIN)
 			return err;
 	}
+
+	/* add pinned metadata to delta list */
+	/* forked bitmap blocks from blockdirty, redirected btree nodes from cursor_redirect */
+	list_splice_tail(&sb->pinned, &sb->commit);
+
+	/* move deferred frees for rollup to delta deferred free list */
+	unstash(sb, &sb->defree, move_deferred);
+
+	/* empty the log */
+	sb->logbase = sb->lognext;
+
+	return 0;
+}
+
+static int stage_delta(struct sb *sb)
+{
+	if (need_flush(sb)) {
+		int err = flush_log(sb);
+		if (err)
+			return err;
+	}
+
+	/* flush forked bitmap blocks, btree node and leaf blocks */
+
+	while (!list_empty(&sb->commit)) {
+		struct buffer_head *buffer = container_of(sb->commit.next, struct buffer_head, link);
+		// mapping, index set but not hashed in mapping
+		buffer->map->io(buffer, 1);
+		brelse(buffer);
+	}
+
+	sb->delta++;
+
+	/* allocate and write log blocks */
+
+	for (unsigned logblock = sb->logthis; sb->lognext; logblock++) {
+		block_t block;
+		int err = balloc(sb, 1, &block);
+		if (err)
+			return err;
+		struct buffer_head *buffer = blockget(mapping(sb->logmap), block);
+		if (!buffer) {
+			bfree(sb, block, 1);
+			return -ENOMEM;
+		};
+		struct logblock *log = bufdata(buffer);
+		log->prevlog = to_be_u64(sb->prevlog);
+		if ((err = diskwrite(sb->dev->fd, buffer->data, sb->blocksize, block << sb->blockbits))) {
+			brelse(buffer);
+			bfree(sb, block, 1);
+			return err;
+		}
+		defer_free(&sb->loghold, bufindex(buffer), 1);
+		brelse(buffer);
+		sb->prevlog = block;
+	}
+
 	return 0;
 }
 
@@ -165,7 +249,9 @@ int main(int argc, char *argv[])
 			struct tux_iattr iattr = { .mode = S_IFREG | S_IRWXU };
 			char name[100];
 			snprintf(name, sizeof(name), "file%i", i);
+			change_begin(sb);
 			free_inode(tuxcreate(sb->rootdir, name, strlen(name), &iattr));
+			change_end(sb);
 		}
 		assert(!tuxsync(sb->rootdir));
 		sb->super = (struct disksuper){ .magic = SB_MAGIC, .volblocks = to_be_u64(sb->blockbits) };
diff -r 4d5b59994ae7 user/kernel/balloc.c
--- a/user/kernel/balloc.c	Wed Jan 28 21:19:05 2009 -0800
+++ b/user/kernel/balloc.c	Sun Feb 01 22:33:25 2009 -0800
@@ -229,7 +229,7 @@ static block_t balloc_from_range(struct 
 					goto final_partial_byte;
 				}
 				found -= run - 1;
-				blockdirty(buffer, sb->delta);
+				blockdirty(buffer, sb->flush, &sb->pinned);
 				set_bits(bufdata(buffer), found & mapmask, run);
 				mark_buffer_dirty(buffer);
 				brelse(buffer);
@@ -278,7 +278,7 @@ int bfree(struct sb *sb, block_t start, 
 	mutex_lock_nested(&sb->bitmap->i_mutex, I_MUTEX_BITMAP);
 	if (!all_set(bufdata(buffer), start &= mapmask, blocks))
 		goto eeek;
-	blockdirty(buffer, sb->delta);
+	blockdirty(buffer, sb->flush, &sb->pinned);
 	clear_bits(bufdata(buffer), start, blocks);
 	mark_buffer_dirty(buffer);
 	brelse(buffer);
diff -r 4d5b59994ae7 user/kernel/btree.c
--- a/user/kernel/btree.c	Wed Jan 28 21:19:05 2009 -0800
+++ b/user/kernel/btree.c	Sun Feb 01 22:33:25 2009 -0800
@@ -348,6 +348,9 @@ int cursor_redirect(struct cursor *curso
 		level_redirect_brelse(cursor, level, clone);
 		log_redirect(sb, oldblock, newblock);
 		defer_free(&sb->defree, oldblock, 1);
+
+		if (level < btree->root.depth)
+			list_move_tail(&clone->link, &sb->pinned);
 
 		if (!level--) {
 			trace("redirect root");
diff -r 4d5b59994ae7 user/kernel/log.c
--- a/user/kernel/log.c	Wed Jan 28 21:19:05 2009 -0800
+++ b/user/kernel/log.c	Sun Feb 01 22:33:25 2009 -0800
@@ -147,7 +147,14 @@ int defer_free(struct stash *defree, blo
 	return stash_value(defree, ((u64)count << 48) + block);
 }
 
-int retire_frees(struct sb *sb, struct stash *defree)
+typedef int (*retire_actor_t)(struct sb *sb, u64 val);
+
+int retire_bfree(struct sb *sb, u64 val)
+{
+	return bfree(sb, val & ~(-1ULL << 48), val >> 48);
+}
+
+int unstash(struct sb *sb, struct stash *defree, retire_actor_t actor)
 {
 	struct flink_head *head = &defree->head;
 	struct page *page;
@@ -160,7 +167,7 @@ int retire_frees(struct sb *sb, struct s
 		if (top == defree->top)
 			top = defree->pos;
 		for (; vec < top; vec++)
-			if ((err = bfree(sb, *vec & ~(-1ULL << 48), *vec >> 48)))
+			if ((err = actor(sb, *vec)))
 				return err;
 		if (flink_is_last(head))
 			break;
@@ -171,6 +178,11 @@ int retire_frees(struct sb *sb, struct s
 	return 0;
 }
 
+int retire_frees(struct sb *sb, struct stash *defree, retire_actor_t *actor)
+{
+	return unstash(sb, defree, retire_bfree);
+}
+
 void destroy_defree(struct stash *defree)
 {
 	empty_stash(defree);
diff -r 4d5b59994ae7 user/kernel/tux3.h
--- a/user/kernel/tux3.h	Wed Jan 28 21:19:05 2009 -0800
+++ b/user/kernel/tux3.h	Sun Feb 01 22:33:25 2009 -0800
@@ -303,7 +303,8 @@ struct sb {
 	struct inode *rootdir;	/* root directory special file */
 	struct inode *vtable;	/* version table special file */
 	struct inode *atable;	/* xattr atom special file */
-	unsigned delta;		/* delta commit counter */
+	unsigned delta;		/* delta commit cycle */
+	unsigned flush;		/* log flush cycle */
 	struct rw_semaphore delta_lock; /* delta transition exclusive */
 	unsigned blocksize, blockbits, blockmask;
 	block_t volblocks, freeblocks, nextalloc;
@@ -314,13 +315,18 @@ struct sb {
 	unsigned freeatom;	/* Start of free atom list in atom table */
 	unsigned atomgen;	/* Next atom number to allocate if no free atoms */
 	loff_t dictsize;	/* Atom dictionary size */
-	struct inode *logmap;	/* Prototype log block cache */
+	struct inode *logmap;	/* Log block cache */
+	block_t prevlog;	/* Previous log block physical address */
 	unsigned logbase;	/* Index of oldest log block in log map */
+	unsigned logthis;	/* Index of first log block in delta */
 	unsigned lognext;	/* Index of next log block in log map */
 	struct buffer_head *logbuf; /* Cached log block */
 	unsigned char *logpos, *logtop; /* Where to emit next log entry */
 	struct mutex loglock;	/* serialize log entries (spinlock me) */
 	struct stash defree;	/* defer extent frees until affer commit */
+	struct stash loghold;	/* defer extent frees until affer log flush */
+	struct list_head pinned; /* dirty metadata not flushed per delta */
+	struct list_head commit; /* dirty metadata flushed per delta */
 #ifdef __KERNEL__
 	struct super_block *vfs_sb; /* Generic kernel superblock */
 #else
diff -r 4d5b59994ae7 user/list.h
--- a/user/list.h	Wed Jan 28 21:19:05 2009 -0800
+++ b/user/list.h	Sun Feb 01 22:33:25 2009 -0800
@@ -70,6 +70,32 @@ static inline int list_empty(const struc
 static inline int list_empty(const struct list_head *head)
 {
 	return head->next == head;
+}
+
+static inline void __list_splice(const struct list_head *list,
+				 struct list_head *prev,
+				 struct list_head *next)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+
+	first->prev = prev;
+	prev->next = first;
+
+	last->next = next;
+	next->prev = last;
+}
+
+static inline void list_splice(const struct list_head *list, struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head, head->next);
+}
+
+static inline void list_splice_tail(struct list_head *list, struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head->prev, head);
 }
 
 #define container_of(ptr, type, member) ({ \

_______________________________________________
Tux3 mailing list
Tux3 at tux3.org
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3



More information about the Tux3 mailing list