[Tux3] Polymorphic blockread for kernel
Daniel Phillips
phillips at phunq.net
Wed Jan 14 22:55:24 PST 2009
Here is a polymorphic blockread for kernel along the lines of the Tux3
userspace blockread, which has a per-mapping blockio method to read or
write the buffer. We use this technique in userspace to abstract away
the difference between physically and logically mapped inodes for the
purpose of reading or writing single blocks. Recently, this technique
found another application: it helps solve the "allocation during bitmap
flush" problem I wrote about earlier. This is what made me decide that
the technique is useful enough to implement in kernel. To be sure, the
same effect can be accomplished by other means, but probably not as
tidily.
For simplicity, the blockio method is stored in the inode->i_private
field, which is otherwise unused by Tux3 or any other block filesystem
on Linux, because we now have a superior mechanism for subclassing
inodes that does not require an additional memory allocation per inode.
Putting a function pointer in the field is somewhat unusual, however the
comment /* fs or device private pointer */ seems to allow this.
Blockread is defined in terms of blockget, and is very simple:
/*
 * Find a block in page cache and ensure its contents are read in.
 *
 * blockget() locates or creates the buffer; if it is not already
 * uptodate, the per-mapping blockio method stored in the host inode's
 * i_private field is invoked with READ to fill it from disk.
 *
 * Returns the buffer with the reference taken by blockget() still held,
 * or an ERR_PTR on failure (the reference is dropped on the error path).
 *
 * NOTE(review): no buffer or page lock is held across the read here —
 * presumably callers serialize access to cold blocks; confirm against
 * the rest of the Tux3 code.
 */
struct buffer_head *blockread(struct address_space *mapping, block_t block)
{
	struct buffer_head *buffer = blockget(mapping, block);
	if (!IS_ERR(buffer) && !buffer_uptodate(buffer)) {
		/* i_private holds a blockio_t (e.g. dev_blockio) per the post above */
		int err = ((blockio_t *)mapping->host->i_private)(buffer, READ);
		if (err) {
			brelse(buffer);	/* drop the reference blockget() took */
			return ERR_PTR(err);
		}
	}
	return buffer;
}
Note: the userspace and kernel versions of buffer_uptodate are subtly
different in a way that will cause problems: in kernel, buffer_uptodate
is true if a buffer is clean (contents same as disk) or dirty (new
contents not yet transferred to disk). In userspace, buffer_uptodate
means clean. So we need to rename the userspace version as
buffer_clean and implement a buffer_clean in kernel that checks both
the uptodate and dirty flags.
Reading the block if it is not present in cache is done by syncio(), a
lightweight bio transfer wrapper I developed earlier with inspiration
from Maciej:
/*
 * blockio_t method for physically mapped inodes: transfer one buffer
 * synchronously to/from the block device backing the page's mapping.
 *
 * The buffer's position on disk is computed from its page index and
 * its byte offset within the page, converted to 512-byte sectors.
 * A single bio_vec describing the buffer is passed to syncio(), which
 * copies it into the bio it allocates.
 */
int dev_blockio(struct buffer_head *buffer, int write)
{
	struct page *page = buffer->b_page;
	unsigned offset = offset_in_page(buffer->b_data);
	return syncio(write, page->mapping->host->i_sb->s_bdev,
		((sector_t)page->index << (PAGE_CACHE_SHIFT - 9)) + (offset >> 9),
		1, &(struct bio_vec){
			.bv_page = page, .bv_offset = offset,
			.bv_len = buffer->b_size });
}
We just pass a pointer to a single bio_vec in this case. The bio_vec
will actually be copied to the real bio, which could be considered a
flaw in this interface, however it is only 8 bytes per biovec, a tiny
fraction of the cost of the transfer itself.
So we can see the whole IO path clearly here. It is very direct: find
or create a buffer in page cache; if the buffer found needs to be read,
submit a bio and wait on it. This is all very close to the metal, much
more so than going through the block IO library with several layers of
mutual calling between library and filesystem, then going through a
complex wrapping of the bio transfer, which ends with the bio endio
calling a buffer-specific endio, which calls a user-supplied buffer
endio function. That twisty path is simply unnecessary, as can be seen
now that we accomplish the same thing by a fundamentally simpler means.
Here is the rest of the support, which is 64 lines including a nice
little library for doing synchronous or asynchronous bio transfers that
will no doubt be useful for some other special purpose tasks such as
writing out log blocks (vecio) or reading the superblock (syncio). The
test() unit test below is called from the filesystem's fill_super.
/* Per-mapping block transfer method, stored in inode->i_private. */
typedef int (blockio_t)(struct buffer_head *buffer, int write);
/* Logical block number within a mapping. */
typedef sector_t block_t;
/*
 * Return the buffer_head at index 'which' on a page, by walking the
 * page's circular b_this_page list from its first buffer.
 */
struct buffer_head *page_buffer(struct page *page, unsigned which)
{
	struct buffer_head *bh;

	for (bh = page_buffers(page); which; which--)
		bh = bh->b_this_page;
	return bh;
}
/*
 * Find or create the buffer for a logical block in page cache.
 *
 * Grabs (and locks) the page covering the block, attaches empty
 * buffers sized by the inode's i_blkbits if none exist, then takes a
 * reference on the buffer for this block before releasing the page
 * lock and the page reference.
 *
 * Returns the buffer with an elevated refcount (caller must brelse),
 * or ERR_PTR(-ENOMEM) if the page could not be obtained.
 *
 * Fix: the original sub-page index mask was block & ~(-1 << subshift);
 * left-shifting a negative value is undefined behavior in C.  Use the
 * equivalent well-defined mask ((block_t)1 << subshift) - 1 instead.
 */
struct buffer_head *blockget(struct address_space *mapping, block_t block)
{
	unsigned blockbits = mapping->host->i_blkbits;
	unsigned subshift = PAGE_CACHE_SHIFT - blockbits;
	struct page *page = grab_cache_page(mapping, block >> subshift);
	struct buffer_head *buffer;

	if (!page)
		return ERR_PTR(-ENOMEM);
	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << blockbits, 0);
	/* index of the block within this page, masked without UB */
	buffer = page_buffer(page, block & (((block_t)1 << subshift) - 1));
	get_bh(buffer);
	unlock_page(page);
	page_cache_release(page);
	return buffer;
}
/*
 * Build and submit a bio from a caller-supplied bio_vec array.
 *
 * The vectors are copied into the freshly allocated bio (bio_alloc
 * starts with zero bi_vcnt/bi_size), the completion callback and its
 * private data are installed, and the bio is submitted asynchronously.
 * Returns 0 on submission, -ENOMEM if the bio could not be allocated.
 */
static int vecio(int rw, struct block_device *dev, sector_t sector,
	bio_end_io_t endio, void *data, unsigned vecs, struct bio_vec *vec)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, vecs);
	unsigned i;

	if (!bio)
		return -ENOMEM;
	bio->bi_bdev = dev;
	bio->bi_sector = sector;
	bio->bi_end_io = endio;
	bio->bi_private = data;
	for (i = 0; i < vecs; i++) {
		bio->bi_io_vec[i] = vec[i];
		bio->bi_size += vec[i].bv_len;
	}
	bio->bi_vcnt = vecs;
	submit_bio(rw, bio);
	return 0;
}
/* Completion state shared between syncio() and its bio endio callback. */
struct biosync { wait_queue_head_t wait; int done, err; };

/*
 * bio completion callback for syncio(): record the transfer result and
 * wake the waiter.  The bio is released first; only the stack-resident
 * biosync (valid until wait_event returns in syncio) is touched after.
 */
static void biosync_endio(struct bio *bio, int err)
{
	struct biosync *sync = bio->bi_private;
	bio_put(bio);
	sync->err = err;
	sync->done = 1;	/* must be set before waking so wait_event's condition holds */
	wake_up(&sync->wait);
}
/*
 * Synchronous vector transfer: submit via vecio() and sleep until the
 * bio completes.  Returns the submission error if vecio() fails,
 * otherwise the completion status reported by biosync_endio().
 */
static int syncio(int rw, struct block_device *dev, sector_t sector, unsigned vecs, struct bio_vec *vec)
{
	struct biosync sync = { .wait = __WAIT_QUEUE_HEAD_INITIALIZER(sync.wait) };
	int err = vecio(rw, dev, sector, biosync_endio, &sync, vecs, vec);

	if (err)
		return err;
	wait_event(sync.wait, sync.done);
	return sync.err;
}
/*
 * Dump a memory region as 16-byte rows: address, hex bytes, and an
 * ASCII column with non-printable bytes (below space or above 127)
 * shown as '.'.  Short final rows are padded so the ASCII column
 * stays aligned.
 *
 * Fixes: %02hx paired with the int-promoted byte value is a format
 * mismatch — use %02x; the odd zero-precision %*.s padding spelling is
 * replaced by the conventional %*s (same output: width spaces).
 */
void hexdump(void *data, unsigned size)
{
	while (size) {
		unsigned char *p;
		int w = 16, n = size < w? size: w, pad = w - n;
		printk("%p: ", data);
		for (p = data; p < (unsigned char *)data + n;)
			printk("%02x ", *p++);
		printk("%*s \"", pad*3, "");
		for (p = data; p < (unsigned char *)data + n;) {
			int c = *p++;
			printk("%c", c < ' ' || c > 127 ? '.' : c);
		}
		printk("\"\n");
		data += w;
		size -= n;
	}
}
static int test(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
inode->i_private = dev_blockio;
inode->i_blkbits = 10;
struct buffer_head *buffer = blockread(inode->i_mapping, 1);
hexdump(buffer->b_data, 0x400);
iput(inode);
return 0;
}
_______________________________________________
Tux3 mailing list
Tux3 at tux3.org
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3
More information about the Tux3
mailing list