[Tux3] Rough draft of atomic commit for userspace
Daniel Phillips
phillips at phunq.net
Sun Feb 1 22:36:27 PST 2009
It compiles, it has not been tried. There are probably mistakes and
omissions.
There are two new lists in the superblock:
sb->pinned ... dirty btree nodes and bitmap blocks written per rollup
sb->commit ... dirty blocks written per delta
This code implements a periodic rollup flush, which mainly moves blocks
from the pinned list to the commit list, and a delta flush, which
writes out the dirty metadata, then writes out the log blocks.
diff -r 4d5b59994ae7 user/buffer.c
--- a/user/buffer.c Wed Jan 28 21:19:05 2009 -0800
+++ b/user/buffer.c Sun Feb 01 22:33:25 2009 -0800
@@ -299,7 +299,7 @@ struct buffer_head *blockread(map_t *map
return buffer;
}
-int blockdirty(struct buffer_head *buffer, unsigned newdelta)
+int blockdirty(struct buffer_head *buffer, unsigned newdelta, struct list_head *forked)
{
unsigned oldstate = buffer->state;
assert(oldstate < BUFFER_STATES);
@@ -316,7 +316,7 @@ int blockdirty(struct buffer_head *buffe
buffer->data = clone->data;
clone->data = data;
clone->index = buffer->index;
- set_buffer_state(clone, oldstate);
+ set_buffer_state_list(clone, oldstate, forked);
brelse(clone);
}
set_buffer_state_list(buffer, BUFFER_DIRTY + newdelta, &buffer->map->dirty);
diff -r 4d5b59994ae7 user/buffer.h
--- a/user/buffer.h Wed Jan 28 21:19:05 2009 -0800
+++ b/user/buffer.h Sun Feb 01 22:33:25 2009 -0800
@@ -57,7 +57,7 @@ struct buffer_head *peekblk(map_t *map,
struct buffer_head *peekblk(map_t *map, block_t block);
struct buffer_head *blockget(map_t *map, block_t block);
struct buffer_head *blockread(map_t *map, block_t block);
-int blockdirty(struct buffer_head *buffer, unsigned newdelta);
+int blockdirty(struct buffer_head *buffer, unsigned newdelta, struct list_head *forked);
int flush_buffers(map_t *map);
int flush_state(unsigned state);
void evict_buffers(map_t *map);
diff -r 4d5b59994ae7 user/commit.c
--- a/user/commit.c Wed Jan 28 21:19:05 2009 -0800
+++ b/user/commit.c Sun Feb 01 22:33:25 2009 -0800
@@ -58,10 +58,18 @@ void replay(struct sb *sb)
}
}
+/* Delta commit and log flush */
+
static int need_delta(struct sb *sb)
{
static unsigned crudehack;
return !(++crudehack % 10);
+}
+
+static int need_flush(struct sb *sb)
+{
+ static unsigned crudehack;
+ return !(++crudehack % 3);
}
int write_bitmap(struct buffer_head *buffer)
@@ -75,14 +83,33 @@ int write_bitmap(struct buffer_head *buf
if (buffer->state - BUFFER_DIRTY == (sb->delta & (BUFFER_DIRTY_STATES - 1)))
return -EAGAIN;
trace("write bitmap %Lx", (L)buffer->index);
- if (!(err = diskwrite(sb->dev->fd, buffer->data, sb->blocksize, seg.block)))
+ if (!(err = diskwrite(sb->dev->fd, buffer->data, sb->blocksize, seg.block << sb->blockbits)))
set_buffer_clean(buffer);
return 0;
}
-static int stage_delta(struct sb *sb)
+int move_deferred(struct sb *sb, u64 val)
{
- assert(sb->dev->bits >= 8 && sb->dev->fd);
+ return stash_value(&sb->defree, val);
+}
+
+static int flush_log(struct sb *sb)
+{
+ /*
+ * Flush a snapshot of the allocation map to disk. Physical blocks for
+ * the bitmaps and new or redirected bitmap btree nodes may be allocated
+ * during the flush. Any bitmap blocks that are (re)dirtied by these
+ * allocations will be written out in the next flush cycle. A redirtied
+ * bitmap block is replaced in cache by a clone, which is then modified.
+ * The original goes onto a list of forked blocks to be written out
+ * separately.
+ */
+
+ /* further block allocations belong to the next cycle */
+ sb->flush++;
+
+ /* map dirty bitmap blocks to disk and write out */
+
struct buffer_head *buffer, *safe;
struct list_head *head = &mapping(sb->bitmap)->dirty;
list_for_each_entry_safe(buffer, safe, head, link) {
@@ -90,6 +117,63 @@ static int stage_delta(struct sb *sb)
if (err != -EAGAIN)
return err;
}
+
+ /* add pinned metadata to delta list */
+ /* forked bitmap blocks from blockdirty, redirected btree nodes from cursor_redirect */
+ list_splice_tail(&sb->pinned, &sb->commit);
+
+ /* move deferred frees for rollup to delta deferred free list */
+ unstash(sb, &sb->defree, move_deferred);
+
+ /* empty the log */
+ sb->logbase = sb->lognext;
+
+ return 0;
+}
+
+static int stage_delta(struct sb *sb)
+{
+ if (need_flush(sb)) {
+ int err = flush_log(sb);
+ if (err)
+ return err;
+ }
+
+ /* flush forked bitmap blocks, btree node and leaf blocks */
+
+ while (!list_empty(&sb->commit)) {
+ struct buffer_head *buffer = container_of(sb->commit.next, struct buffer_head, link);
+ // mapping, index set but not hashed in mapping
+ buffer->map->io(buffer, 1);
+ brelse(buffer);
+ }
+
+ sb->delta++;
+
+ /* allocate and write log blocks */
+
+ for (unsigned logblock = sb->logthis; sb->lognext; logblock++) {
+ block_t block;
+ int err = balloc(sb, 1, &block);
+ if (err)
+ return err;
+ struct buffer_head *buffer = blockget(mapping(sb->logmap), block);
+ if (!buffer) {
+ bfree(sb, block, 1);
+ return -ENOMEM;
+ };
+ struct logblock *log = bufdata(buffer);
+ log->prevlog = to_be_u64(sb->prevlog);
+ if ((err = diskwrite(sb->dev->fd, buffer->data, sb->blocksize, block << sb->blockbits))) {
+ brelse(buffer);
+ bfree(sb, block, 1);
+ return err;
+ }
+ defer_free(&sb->loghold, bufindex(buffer), 1);
+ brelse(buffer);
+ sb->prevlog = block;
+ }
+
return 0;
}
@@ -165,7 +249,9 @@ int main(int argc, char *argv[])
struct tux_iattr iattr = { .mode = S_IFREG | S_IRWXU };
char name[100];
snprintf(name, sizeof(name), "file%i", i);
+ change_begin(sb);
free_inode(tuxcreate(sb->rootdir, name, strlen(name), &iattr));
+ change_end(sb);
}
assert(!tuxsync(sb->rootdir));
sb->super = (struct disksuper){ .magic = SB_MAGIC, .volblocks = to_be_u64(sb->blockbits) };
diff -r 4d5b59994ae7 user/kernel/balloc.c
--- a/user/kernel/balloc.c Wed Jan 28 21:19:05 2009 -0800
+++ b/user/kernel/balloc.c Sun Feb 01 22:33:25 2009 -0800
@@ -229,7 +229,7 @@ static block_t balloc_from_range(struct
goto final_partial_byte;
}
found -= run - 1;
- blockdirty(buffer, sb->delta);
+ blockdirty(buffer, sb->flush, &sb->pinned);
set_bits(bufdata(buffer), found & mapmask, run);
mark_buffer_dirty(buffer);
brelse(buffer);
@@ -278,7 +278,7 @@ int bfree(struct sb *sb, block_t start,
mutex_lock_nested(&sb->bitmap->i_mutex, I_MUTEX_BITMAP);
if (!all_set(bufdata(buffer), start &= mapmask, blocks))
goto eeek;
- blockdirty(buffer, sb->delta);
+ blockdirty(buffer, sb->flush, &sb->pinned);
clear_bits(bufdata(buffer), start, blocks);
mark_buffer_dirty(buffer);
brelse(buffer);
diff -r 4d5b59994ae7 user/kernel/btree.c
--- a/user/kernel/btree.c Wed Jan 28 21:19:05 2009 -0800
+++ b/user/kernel/btree.c Sun Feb 01 22:33:25 2009 -0800
@@ -348,6 +348,9 @@ int cursor_redirect(struct cursor *curso
level_redirect_brelse(cursor, level, clone);
log_redirect(sb, oldblock, newblock);
defer_free(&sb->defree, oldblock, 1);
+
+ if (level < btree->root.depth)
+ list_move_tail(&clone->link, &sb->pinned);
if (!level--) {
trace("redirect root");
diff -r 4d5b59994ae7 user/kernel/log.c
--- a/user/kernel/log.c Wed Jan 28 21:19:05 2009 -0800
+++ b/user/kernel/log.c Sun Feb 01 22:33:25 2009 -0800
@@ -147,7 +147,14 @@ int defer_free(struct stash *defree, blo
return stash_value(defree, ((u64)count << 48) + block);
}
-int retire_frees(struct sb *sb, struct stash *defree)
+typedef int (*retire_actor_t)(struct sb *sb, u64 val);
+
+int retire_bfree(struct sb *sb, u64 val)
+{
+ return bfree(sb, val & ~(-1ULL << 48), val >> 48);
+}
+
+int unstash(struct sb *sb, struct stash *defree, retire_actor_t actor)
{
struct flink_head *head = &defree->head;
struct page *page;
@@ -160,7 +167,7 @@ int retire_frees(struct sb *sb, struct s
if (top == defree->top)
top = defree->pos;
for (; vec < top; vec++)
- if ((err = bfree(sb, *vec & ~(-1ULL << 48), *vec >> 48)))
+ if ((err = actor(sb, *vec)))
return err;
if (flink_is_last(head))
break;
@@ -171,6 +178,11 @@ int retire_frees(struct sb *sb, struct s
return 0;
}
+int retire_frees(struct sb *sb, struct stash *defree, retire_actor_t *actor)
+{
+ return unstash(sb, defree, retire_bfree);
+}
+
void destroy_defree(struct stash *defree)
{
empty_stash(defree);
diff -r 4d5b59994ae7 user/kernel/tux3.h
--- a/user/kernel/tux3.h Wed Jan 28 21:19:05 2009 -0800
+++ b/user/kernel/tux3.h Sun Feb 01 22:33:25 2009 -0800
@@ -303,7 +303,8 @@ struct sb {
struct inode *rootdir; /* root directory special file */
struct inode *vtable; /* version table special file */
struct inode *atable; /* xattr atom special file */
- unsigned delta; /* delta commit counter */
+ unsigned delta; /* delta commit cycle */
+ unsigned flush; /* log flush cycle */
struct rw_semaphore delta_lock; /* delta transition exclusive */
unsigned blocksize, blockbits, blockmask;
block_t volblocks, freeblocks, nextalloc;
@@ -314,13 +315,18 @@ struct sb {
unsigned freeatom; /* Start of free atom list in atom table */
unsigned atomgen; /* Next atom number to allocate if no free atoms */
loff_t dictsize; /* Atom dictionary size */
- struct inode *logmap; /* Prototype log block cache */
+ struct inode *logmap; /* Log block cache */
+ block_t prevlog; /* Previous log block physical address */
unsigned logbase; /* Index of oldest log block in log map */
+ unsigned logthis; /* Index of first log block in delta */
unsigned lognext; /* Index of next log block in log map */
struct buffer_head *logbuf; /* Cached log block */
unsigned char *logpos, *logtop; /* Where to emit next log entry */
struct mutex loglock; /* serialize log entries (spinlock me) */
struct stash defree; /* defer extent frees until affer commit */
+ struct stash loghold; /* defer extent frees until affer log flush */
+ struct list_head pinned; /* dirty metadata not flushed per delta */
+ struct list_head commit; /* dirty metadata flushed per delta */
#ifdef __KERNEL__
struct super_block *vfs_sb; /* Generic kernel superblock */
#else
diff -r 4d5b59994ae7 user/list.h
--- a/user/list.h Wed Jan 28 21:19:05 2009 -0800
+++ b/user/list.h Sun Feb 01 22:33:25 2009 -0800
@@ -70,6 +70,32 @@ static inline int list_empty(const struc
static inline int list_empty(const struct list_head *head)
{
return head->next == head;
+}
+
+static inline void __list_splice(const struct list_head *list,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+
+ first->prev = prev;
+ prev->next = first;
+
+ last->next = next;
+ next->prev = last;
+}
+
+static inline void list_splice(const struct list_head *list, struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head, head->next);
+}
+
+static inline void list_splice_tail(struct list_head *list, struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head->prev, head);
}
#define container_of(ptr, type, member) ({ \
_______________________________________________
Tux3 mailing list
Tux3 at tux3.org
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3
More information about the Tux3
mailing list