ddsnap.c
/*
* Clustered Snapshot Metadata Server
*
* Daniel Phillips, Nov 2003 to May 2004
* (c) 2003 Sistina Software Inc.
* (c) 2004 Red Hat Software Inc.
*
*/
#define _GNU_SOURCE /* Berserk glibc headers: O_DIRECT not defined unless _GNU_SOURCE defined */
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <time.h>
#include <signal.h>
#include <sys/poll.h>
#include <sys/types.h>
#include <sys/stat.h> // fstat, struct stat (used by fd_size below)
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <netdb.h> // gethostbyname2_r
#include <linux/fs.h> // BLKGETSIZE
#include <popt.h>
#include "list.h"
#include "buffer.h"
#include "ddsnap.h"
#include "../dm-ddsnap.h"
#include "trace.h"
#define trace trace_off
#define jtrace trace_off
#define BUSHY
/*
Todo:
BTree
* coalesce leafs/nodes for delete
- B*Tree splitting
Allocation bitmaps
- allocation statistics
- Per-snapshot free space as full-tree pass
- option to track specific snapshot(s) on the fly
- return stats to client (on demand? always?)
* Bitmap block radix tree - resizing
- allocation policy
Journal
\ allocation
\ write commit block
\ write target blocks
\ recovery
- stats, misc data in commit block?
File backing
\ double linked list ops
- buffer lru
- buffer writeout policy
- buffer eviction policy
- verify no busy buffers between operations
Snapshot vs origin locking
- anti-starvation measures
Message handling
- send reply on async write completion
- build up immediate reply in separate buffer
Snapshot handling path
- background copyout thread
- try AIO
- coalesce leaves/nodes on delete
- should wait for current queries on snap to complete
- background deletion optimization
- record current deletion list in superblock
Multithreading
- separate thread for copyouts
- separate thread for buffer flushing
- separate thread for new connections (?)
Utilities
- don't include anything not needed for create
- snapshot store integrity check (snapcheck)
Failover
\ Mark superblock active/inactive
+ upload client locks on server restart
+ release snapshot read locks for dead client
- Examine entire tree to initialize statistics after unsaved halt
General
\ Prevent multiple server starts on same snapshot store
+ More configurable tracing
- Add more internal consistency checks
- Magic number + version for superblock
- Flesh out and audit error paths
- Make it endian-neutral
- Verify wordsize neutral
- Add an on-the-fly verify path
+ strip out the unit testing gunk
+ More documentation
- Audits and more audits
- Memory inversion prevention
- failed chunk alloc fails operation, no abort
Cluster integration
+ Restart/Error recovery/reporting
*/
/*
* Miscellaneous Primitives
*/
typedef int fd_t;
/*
* Ripped from libiddev. It's not quite ugly enough to convince me to
* add a new dependency on a library that nobody has yet, but it's close.
*/
static int fd_size(int fd, u64 *bytes)
{
struct stat stat;
int error;
if ((error = fstat(fd, &stat)))
return error;
if (S_ISREG(stat.st_mode)) {
*bytes = stat.st_size;
return 0;
}
if ((error = ioctl(fd, BLKGETSIZE64, bytes))) {
unsigned sectors;
if ((error = ioctl(fd, BLKGETSIZE, &sectors)))
return error;
*bytes = ((u64)sectors) << 9;
}
return 0;
}
void hexdump(void *data, unsigned length)
{
while (length) {
int row = length < 16? length: 16;
printf("%p: ", data);
length -= row;
while (row--)
printf("%02hhx ", *(unsigned char *)data++);
printf("\n");
}
}
/* BTree Operations */
/* Directory at the base of the leaf block */
#define MAX_SNAPSHOTS 64
struct enode
{
u32 count;
u32 unused;
struct index_entry
{
u64 key; // note: entries[0].key never accessed
sector_t sector; // node sector address goes here
} entries[];
};
struct eleaf
{
le_u16 magic;
le_u16 version;
le_u32 count;
le_u64 base_chunk;
le_u64 using_mask;
struct etree_map
{
le_u32 offset;
le_u32 rchunk;
}
map[];
};
static inline struct enode *buffer2node(struct buffer *buffer)
{
return (struct enode *)buffer->data;
}
static inline struct eleaf *buffer2leaf(struct buffer *buffer)
{
return (struct eleaf *)buffer->data;
}
/* On-disk Format */
struct exception
{
le_u64 share;
le_u64 chunk;
};
static inline struct exception *emap(struct eleaf *leaf, unsigned i)
{
return (struct exception *)((char *) leaf + leaf->map[i].offset);
}
struct superblock
{
/* Persistent, saved to disk */
struct disksuper
{
char magic[8];
u64 create_time;
sector_t etree_root;
sector_t bitmap_base;
sector_t chunks, freechunks;
sector_t orgchunks;
chunk_t last_alloc;
u64 flags;
u32 blocksize_bits, chunksize_bits;
u64 deleting;
struct snapshot
{
u32 ctime; // relative to middle 32 bits of super ctime
u32 tag; // external name of snapshot
u8 bit; // internal snapshot number
s8 prio; // 0=normal, 127=highest, -128=lowest (drop first)
char reserved[7];
} snaplist[MAX_SNAPSHOTS];
u32 snapshots;
u32 etree_levels;
u32 bitmap_blocks;
s32 journal_base, journal_next, journal_size;
u32 sequence;
} image;
/* Derived, not saved to disk */
u64 snapmask;
u32 blocksize, chunksize, blocks_per_node;
u32 sectors_per_block_bits, sectors_per_block;
u32 sectors_per_chunk_bits, sectors_per_chunk;
unsigned flags;
unsigned snapdev, orgdev;
unsigned snaplock_hash_bits;
struct snaplock **snaplocks;
unsigned copybuf_size;
char *copybuf;
chunk_t source_chunk;
chunk_t dest_exception;
unsigned copy_chunks;
unsigned max_commit_blocks;
};
#define SB_BUSY 1
#define SB_MAGIC "snapshot"
/* Journal handling */
#define JMAGIC "MAGICNUM"
struct commit_block
{
char magic[8];
u32 checksum;
s32 sequence;
u32 entries;
u64 sector[];
} PACKED;
static sector_t journal_sector(struct superblock *sb, unsigned i)
{
return sb->image.journal_base + (i << sb->sectors_per_block_bits);
}
static inline struct commit_block *buf2block(struct buffer *buf)
{
return (void *)buf->data;
}
unsigned next_journal_block(struct superblock *sb)
{
unsigned next = sb->image.journal_next;
if (++sb->image.journal_next == sb->image.journal_size)
sb->image.journal_next = 0;
return next;
}
static int is_commit_block(struct commit_block *block)
{
return !memcmp(&block->magic, JMAGIC, sizeof(block->magic));
}
static u32 checksum_block(struct superblock *sb, u32 *data)
{
int i, sum = 0;
for (i = 0; i < sb->blocksize >> 2; i++) // sum every 32 bit word in the block
sum += data[i];
return sum;
}
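/*
 * Checksum convention (as used below): commit_transaction stores the negated
 * sum of the rest of the block in commit->checksum, so a clean commit block
 * sums to zero and recover_journal treats a nonzero checksum_block() result
 * as corruption.
 */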
static struct buffer *jgetblk(struct superblock *sb, unsigned i)
{
return getblk(sb->snapdev, journal_sector(sb, i), sb->blocksize);
}
static struct buffer *jread(struct superblock *sb, unsigned i)
{
return bread(sb->snapdev, journal_sector(sb, i), sb->blocksize);
}
/*
* For now there is only ever one open transaction in the journal, the newest
* one, so we don't have to check for journal wrap, but just ensure that each
* transaction stays small enough to fit in the journal.
*
* Since we don't have any asynchronous IO at the moment, journal commit is
* straightforward: walk through the dirty blocks once, writing them to the
* journal, then again, adding sector locations to the commit block. We know
* the dirty list didn't change between the two passes. When asynchronous
* IO arrives here, this all has to be handled a lot more carefully.
*/
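/*
 * Illustrative layout of one committed transaction in the (circular) journal:
 *
 *    [copy of A][copy of B][copy of C][commit]
 *
 * where commit->entries = 3, commit->sector[] holds the home sectors of
 * A, B and C, and commit->sequence increases by one per transaction.
 */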
static void commit_transaction(struct superblock *sb)
{
// flush_buffers();
// return;
if (list_empty(&dirty_buffers))
return;
struct list_head *list;
list_for_each(list, &dirty_buffers) {
struct buffer *buffer = list_entry(list, struct buffer, list);
unsigned pos = next_journal_block(sb);
jtrace(warn("journal data sector = %Lx [%u]", buffer->sector, pos);)
assert(buffer_dirty(buffer));
write_buffer_to(buffer, journal_sector(sb, pos));
}
unsigned pos = next_journal_block(sb);
struct buffer *commit_buffer = jgetblk(sb, pos);
struct commit_block *commit = buf2block(commit_buffer);
*commit = (struct commit_block){ .magic = JMAGIC, .sequence = sb->image.sequence++ };
while (!list_empty(&dirty_buffers)) {
struct list_head *entry = dirty_buffers.next;
struct buffer *buffer = list_entry(entry, struct buffer, list);
jtrace(warn("write data sector = %Lx", buffer->sector);)
assert(buffer_dirty(buffer));
assert(commit->entries < sb->max_commit_blocks);
commit->sector[commit->entries++] = buffer->sector;
write_buffer(buffer); // deletes it from dirty (fixme: fragile)
// we hope the order we just listed these is the same as committed above
}
jtrace(warn("commit journal block [%u]", pos);)
commit->checksum = 0;
commit->checksum = -checksum_block(sb, (void *)commit);
write_buffer_to(commit_buffer, journal_sector(sb, pos));
brelse(commit_buffer);
}
int recover_journal(struct superblock *sb)
{
struct buffer *buffer;
typeof(((struct commit_block *)NULL)->sequence) sequence = -1; // -1: no commit block seen yet
int scribbled = -1, last_block = -1, newest_block = -1;
int data_from_start = 0, data_from_last = 0;
int size = sb->image.journal_size;
char *why = "";
unsigned i;
/* Scan full journal, find newest commit */
for (i = 0; i < size; brelse(buffer), i++) {
buffer = jread(sb, i);
struct commit_block *block = buf2block(buffer);
if (!is_commit_block(block)) {
jtrace(warn("[%i] <data>", i);)
if (sequence == -1)
data_from_start++;
else
data_from_last++;
continue;
}
if (checksum_block(sb, (void *)block)) {
warn("block %i failed checksum", i);
hexdump(block, 40);
if (scribbled != -1) {
why = "Too many scribbled blocks in journal";
goto failed;
}
if (newest_block != -1 && newest_block != last_block) {
why = "Bad block not last written";
goto failed;
}
scribbled = i;
if (last_block != -1)
newest_block = last_block;
sequence++;
continue;
}
jtrace(warn("[%i] seq=%i", i, block->sequence);)
if (last_block != -1 && block->sequence != sequence + 1) {
int delta = sequence - block->sequence;
if (delta <= 0 || delta > size) {
why = "Bad sequence";
goto failed;
}
if (newest_block != -1) {
why = "Multiple sequence wraps";
goto failed;
}
if (!(scribbled == -1 || scribbled == i - 1)) {
why = "Bad block not last written";
goto failed;
}
newest_block = last_block;
}
data_from_last = 0;
last_block = i;
sequence = block->sequence;
}
if (last_block == -1) {
why = "No commit blocks found";
goto failed;
}
if (newest_block == -1) {
/* test for all the legal scribble positions here */
newest_block = last_block;
}
jtrace(warn("found newest commit [%u]", newest_block);)
buffer = jread(sb, newest_block);
struct commit_block *commit = buf2block(buffer);
unsigned entries = commit->entries;
for (i = 0; i < entries; i++) {
unsigned pos = (newest_block - entries + i + size) % size;
struct buffer *databuf = jread(sb, pos);
struct commit_block *block = buf2block(databuf);
if (is_commit_block(block)) {
error("data block [%u] marked as commit block", pos);
continue;
}
jtrace(warn("write journal [%u] data to %Lx", pos, commit->sector[i]);)
write_buffer_to(databuf, commit->sector[i]);
brelse(databuf);
}
sb->image.journal_next = (newest_block + 1 + size) % size;
sb->image.sequence = commit->sequence + 1;
brelse(buffer);
return 0;
failed:
errno = EIO; /* return a misleading error (be part of the problem) */
error("Journal recovery failed, %s", why);
return -1;
}
static void _show_journal(struct superblock *sb)
{
int i, j;
for (i = 0; i < sb->image.journal_size; i++) {
struct buffer *buf = jread(sb, i);
struct commit_block *block = buf2block(buf);
if (!is_commit_block(block)) {
printf("[%i] <data>\n", i);
continue;
}
printf("[%i] seq=%i (%i)", i, block->sequence, block->entries);
for (j = 0; j < block->entries; j++)
printf(" %Lx", (long long)block->sector[j]);
printf("\n");
brelse(buf);
}
printf("\n");
}
#define show_journal(sb) do { warn("Journal..."); _show_journal(sb); } while (0)
/* BTree leaf operations */
/*
* We operate directly on the BTree leaf blocks to insert exceptions and
* to enquire the sharing status of given chunks. This means all the data
* items in the block need to be properly aligned for architecture
* independence. To save space and to permit binary search a directory
* map at the beginning of the block points at the exceptions stored
* at the top of the block. The difference between two successive directory
* pointers gives the number of distinct exceptions for a given chunk.
* Each exception is paired with a bitmap that specifies which snapshots
* the exception belongs to.
*
* The chunk addresses in the leaf block directory are relative to a base
* chunk to save space. These are currently 32 bit values but may become
* 16 bits values. Since each is paired with a pointer into the list of
* exceptions, 16 bit emap entries would limit the blocksize to 64K.
*
* A mask in the leaf block header specifies which snapshots are actually
* encoded in the chunk. This allows lazy deletion (almost, needs fixing)
*
* The leaf operations need to know the size of the block somehow.
* Currently that is accomplished by inserting the block size as a sentinel
* in the block directory map; this may change.
*
* When an exception is created by a write to the origin it is initially
* shared by all snapshots that don't already have exceptions. Snapshot
* writes may later unshare some of these exceptions.
*/
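/*
 * Rough picture of a leaf block (low addresses on the left):
 *
 *    +--------+--------------------------+------+--------------------------+
 *    | header | map[0..count] + sentinel | free | exceptions, packed at top |
 *    +--------+--------------------------+------+--------------------------+
 *
 * The exceptions for relative chunk map[i].rchunk occupy the byte range
 * [map[i].offset, map[i+1].offset); the sentinel entry map[count] marks the
 * top of the exception area (the block size for a normal leaf).
 */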
/*
* To do:
* - Check leaf, index structure
* - Mechanism for identifying which snapshots are in each leaf
* - binsearch for leaf, index lookup
* - enforce 32 bit address range within leaf
*/
struct buffer *snapread(struct superblock *sb, sector_t sector)
{
return bread(sb->snapdev, sector, sb->blocksize);
}
void show_leaf(struct eleaf *leaf)
{
struct exception *p;
int i;
printf("%i chunks: ", leaf->count);
for (i = 0; i < leaf->count; i++) {
printf("%x=", leaf->map[i].rchunk);
// printf("@%i ", leaf->map[i].offset);
for (p = emap(leaf, i); p < emap(leaf, i+1); p++)
printf("%Lx/%08llx%s", p->chunk, p->share, p+1 < emap(leaf, i+1)? ",": " ");
}
// printf("top@%i", leaf->map[i].offset);
printf("\n");
}
/*
* origin_chunk_unique: an origin logical chunk is shared unless all snapshots
* have exceptions.
*/
int origin_chunk_unique(struct eleaf *leaf, u64 chunk, u64 snapmask)
{
u64 using = 0;
unsigned i, target = chunk - leaf->base_chunk;
struct exception *p;
for (i = 0; i < leaf->count; i++)
if (leaf->map[i].rchunk == target)
goto found;
return !snapmask;
found:
for (p = emap(leaf, i); p < emap(leaf, i+1); p++)
using |= p->share;
return !(~using & snapmask);
}
/*
* snapshot_chunk_unique: a snapshot logical chunk is shared if it has no
* exception or has the same exception as another snapshot. In any case
* if the chunk has an exception we need to know the exception address.
*/
int snapshot_chunk_unique(struct eleaf *leaf, u64 chunk, int snapshot, u64 *exception)
{
u64 mask = 1LL << snapshot;
unsigned i, target = chunk - leaf->base_chunk;
struct exception *p;
for (i = 0; i < leaf->count; i++)
if (leaf->map[i].rchunk == target)
goto found;
return 0;
found:
for (p = emap(leaf, i); p < emap(leaf, i+1); p++)
/* shared if more than one bit set including this one */
if ((p->share & mask)) {
*exception = p->chunk;
// printf("unique %Lx %Lx\n", p->share, mask);
return !(p->share & ~mask);
}
return 0;
}
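/*
 * Worked example (illustrative): suppose snapmask is 0b0111 (snapshots 0, 1
 * and 2 exist) and a chunk has two exceptions with share masks 0b0001 and
 * 0b0110. origin_chunk_unique computes using = 0b0111, so ~using & snapmask
 * is zero and the origin chunk is unique (every snapshot already has a copy).
 * snapshot_chunk_unique for snapshot 1 finds the 0b0110 exception but returns
 * 0 because snapshot 2 shares it, so a snapshot write must copy out first.
 */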
/*
* add_exception_to_leaf:
* - cycle through map to find existing logical chunk or insertion point
* - if not found need to add new chunk address
* - move tail of map up
* - store new chunk address in map
* - otherwise
* - for origin:
* - or together all sharemaps, invert -> new map
* - for snapshot:
* - clear out bit for existing exception
* - if sharemap zero warn and reuse this location
* - insert new exception
* - move head of exceptions down
* - store new exception/sharemap
* - adjust map head offsets
*
* If the new exception won't fit in the leaf, return an error so that
* higher level code may split the leaf and try again. This keeps the
* leaf-editing code complexity down to a dull roar.
*/
unsigned leaf_freespace(struct eleaf *leaf)
{
char *maptop = (char *)(&leaf->map[leaf->count + 1]); // include sentinel
return (char *)emap(leaf, 0) - maptop;
}
unsigned leaf_payload(struct eleaf *leaf)
{
int lower = (char *)(&leaf->map[leaf->count]) - (char *)leaf->map;
int upper = (char *)emap(leaf, leaf->count) - (char *)emap(leaf, 0);
return lower + upper;
}
int add_exception_to_leaf(struct eleaf *leaf, u64 chunk, u64 exception, int snapshot, u64 active)
{
unsigned i, j, target = chunk - leaf->base_chunk;
u64 mask = 1ULL << snapshot, sharemap;
struct exception *ins, *exceptions = emap(leaf, 0);
char *maptop = (char *)(&leaf->map[leaf->count + 1]); // include sentinel
int free = (char *)exceptions - maptop
#ifdef BUSHY
- 10
#endif
;
trace(warn("chunk %Lx exception %Lx, snapshot = %i", chunk, exception, snapshot);)
for (i = 0; i < leaf->count; i++) // !!! binsearch goes here
if (leaf->map[i].rchunk >= target)
break;
if (i == leaf->count || leaf->map[i].rchunk > target) {
if (free < sizeof(struct exception) + sizeof(struct etree_map))
return -EFULL;
ins = emap(leaf, i);
memmove(&leaf->map[i+1], &leaf->map[i], maptop - (char *)&leaf->map[i]);
leaf->map[i].offset = (char *)ins - (char *)leaf;
leaf->map[i].rchunk = target;
leaf->count++;
sharemap = snapshot == -1? active: mask;
goto insert;
}
if (free < sizeof(struct exception))
return -EFULL;
if (snapshot == -1) {
for (sharemap = 0, ins = emap(leaf, i); ins < emap(leaf, i+1); ins++)
sharemap |= ins->share;
sharemap = (~sharemap) & active;
} else {
for (ins = emap(leaf, i); ins < emap(leaf, i+1); ins++)
if ((ins->share & mask)) {
ins->share &= ~mask;
break;
}
sharemap = mask;
}
ins = emap(leaf, i);
insert:
memmove(exceptions - 1, exceptions, (char *)ins - (char *)exceptions);
ins--;
ins->share = sharemap;
ins->chunk = exception;
for (j = 0; j <= i; j++)
leaf->map[j].offset -= sizeof(struct exception);
return 0;
}
/*
* split_leaf: Split one leaf into two approximately in the middle. Copy
* the upper half of entries to the new leaf and move the lower half of
* entries to the top of the original block.
*/
u64 split_leaf(struct eleaf *leaf, struct eleaf *leaf2)
{
unsigned i, nhead = (leaf->count + 1) / 2, ntail = leaf->count - nhead, tailsize;
/* Should split at middle of data instead of median exception */
u64 splitpoint = leaf->map[nhead].rchunk + leaf->base_chunk;
char *phead, *ptail;
phead = (char *)emap(leaf, 0);
ptail = (char *)emap(leaf, nhead);
tailsize = (char *)emap(leaf, leaf->count) - ptail;
/* Copy upper half to new leaf */
memcpy(leaf2, leaf, offsetof(struct eleaf, map)); // header
memcpy(&leaf2->map[0], &leaf->map[nhead], (ntail + 1) * sizeof(struct etree_map)); // map
memcpy(ptail - (char *)leaf + (char *)leaf2, ptail, tailsize); // data
leaf2->count = ntail;
/* Move lower half to top of block */
memmove(phead + tailsize, phead, ptail - phead);
leaf->count = nhead;
for (i = 0; i <= nhead; i++) // also adjust sentinel
leaf->map[i].offset += tailsize;
leaf->map[nhead].rchunk = 0; // tidy up
return splitpoint;
}
void merge_leaves(struct eleaf *leaf, struct eleaf *leaf2)
{
unsigned nhead = leaf->count, ntail = leaf2->count, i;
unsigned tailsize = (char *)emap(leaf2, ntail) - (char *)emap(leaf2, 0);
char *phead = (char *)emap(leaf, 0), *ptail = (char *)emap(leaf, nhead);
// adjust pointers
for (i = 0; i <= nhead; i++) // also adjust sentinel
leaf->map[i].offset -= tailsize;
// move data down
phead = (char *)emap(leaf, 0);
ptail = (char *)emap(leaf, nhead);
memmove(phead, phead + tailsize, ptail - phead);
// move data from leaf2 to top
memcpy(ptail, (char *)emap(leaf2, 0), tailsize); // data
memcpy(&leaf->map[nhead], &leaf2->map[0], (ntail + 1) * sizeof(struct etree_map)); // map
leaf->count += ntail;
}
void merge_nodes(struct enode *node, struct enode *node2)
{
memcpy(&node->entries[node->count], &node2->entries[0], node2->count * sizeof(struct index_entry));
node->count += node2->count;
}
void init_leaf(struct eleaf *leaf, int block_size)
{
leaf->magic = 0x1eaf;
leaf->version = 0;
leaf->base_chunk = 0;
leaf->count = 0;
leaf->map[0].offset = block_size;
#ifdef BUSHY
leaf->map[0].offset = 200;
#endif
}
/*
* Chunk allocation via bitmaps
*/
#define SB_DIRTY 1
#define SB_SECTOR 8
#define SB_SIZE 4096
void set_sb_dirty(struct superblock *sb)
{
sb->flags |= SB_DIRTY;
}
static inline int get_bitmap_bit(unsigned char *bitmap, unsigned bit)
{
return bitmap[bit >> 3] & (1 << (bit & 7));
}
static inline void set_bitmap_bit(unsigned char *bitmap, unsigned bit)
{
bitmap[bit >> 3] |= 1 << (bit & 7);
}
static inline void clear_bitmap_bit(unsigned char *bitmap, unsigned bit)
{
bitmap[bit >> 3] &= ~(1 << (bit & 7));
}
static unsigned calc_bitmap_blocks(struct superblock *sb, u64 chunks)
{
unsigned chunkshift = sb->image.chunksize_bits;
return (chunks + (1 << (chunkshift + 3)) - 1) >> (chunkshift + 3);
}
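/*
 * One bitmap block (one chunk, since chunksize == blocksize here) holds
 * 1 << (chunksize_bits + 3) allocation bits; for example, with 4K chunks
 * (chunksize_bits = 12) each bitmap block covers 32768 chunks, i.e. 128MB
 * of snapshot store.
 */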
static void init_allocation(struct superblock *sb)
{
u64 chunks = sb->image.chunks;
unsigned bitmaps = calc_bitmap_blocks(sb, chunks);
unsigned bitmap_base_chunk = (SB_SECTOR + sb->sectors_per_block + sb->sectors_per_chunk - 1) >> sb->sectors_per_chunk_bits;
unsigned bitmap_chunks = sb->image.bitmap_blocks = bitmaps; // !!! chunksize same as blocksize
unsigned reserved = bitmap_base_chunk + bitmap_chunks + sb->image.journal_size; // !!! chunksize same as blocksize
unsigned sector = sb->image.bitmap_base = bitmap_base_chunk << sb->sectors_per_chunk_bits;
warn("snapshot store size: %Lu chunks (%Lu sectors)", chunks, chunks << sb->sectors_per_chunk_bits);
printf("Initializing %u bitmap blocks... ", bitmaps);
unsigned i;
for (i = 0; i < bitmaps; i++, sector += sb->sectors_per_block) {
struct buffer *buffer = getblk(sb->snapdev, sector, sb->blocksize);
printf("%Lx ", buffer->sector);
memset(buffer->data, 0, sb->blocksize);
/* Reserve bitmaps and superblock */
if (i == 0) {
unsigned i;
for (i = 0; i < reserved; i++)
set_bitmap_bit(buffer->data, i);
}
/* Suppress overrun allocation in partial last byte */
if (i == bitmaps - 1 && (chunks & 7))
buffer->data[(chunks >> 3) & (sb->blocksize - 1)] |= 0xff << (chunks & 7);
trace_off(dump_buffer(buffer, 0, 16);)
brelse_dirty(buffer);
}
printf("\n");
sb->image.freechunks = chunks - reserved;
sb->image.last_alloc = 0;
sb->image.journal_base = (bitmap_base_chunk + bitmap_chunks) << sb->sectors_per_chunk_bits;
}
static void free_chunk(struct superblock *sb, chunk_t chunk)
{
unsigned bitmap_shift = sb->image.blocksize_bits + 3, bitmap_mask = (1 << bitmap_shift ) - 1;
u64 bitmap_block = chunk >> bitmap_shift;
trace(printf("free chunk %Lx\n", chunk);)
struct buffer *buffer = snapread(sb, sb->image.bitmap_base + (bitmap_block << sb->sectors_per_block_bits));
if (!get_bitmap_bit(buffer->data, chunk & bitmap_mask)) {
warn("chunk %Lx already free!", (long long)chunk);
brelse(buffer);
return;
}
clear_bitmap_bit(buffer->data, chunk & bitmap_mask);
brelse_dirty(buffer);
sb->image.freechunks++;
set_sb_dirty(sb); // !!! optimize this away
}
static inline void free_block(struct superblock *sb, sector_t address)
{
free_chunk(sb, address >> sb->sectors_per_chunk_bits); // !!! assumes blocksize = chunksize
}
void grab_chunk(struct superblock *sb, chunk_t chunk) // just for testing
{
unsigned bitmap_shift = sb->image.blocksize_bits + 3, bitmap_mask = (1 << bitmap_shift ) - 1;
u64 bitmap_block = chunk >> bitmap_shift;
struct buffer *buffer = snapread(sb, sb->image.bitmap_base + (bitmap_block << sb->sectors_per_block_bits));
assert(!get_bitmap_bit(buffer->data, chunk & bitmap_mask));
set_bitmap_bit(buffer->data, chunk & bitmap_mask);
brelse_dirty(buffer);
}
chunk_t alloc_chunk_range(struct superblock *sb, chunk_t chunk, chunk_t range)
{
unsigned bitmap_shift = sb->image.blocksize_bits + 3, bitmap_mask = (1 << bitmap_shift ) - 1;
u64 blocknum = chunk >> bitmap_shift;
unsigned bit = chunk & 7, offset = (chunk & bitmap_mask) >> 3;
u64 length = (range + bit + 7) >> 3;
while (1) {
struct buffer *buffer = snapread(sb, sb->image.bitmap_base + (blocknum << sb->sectors_per_block_bits));
unsigned char c, *p = buffer->data + offset;
unsigned tail = sb->blocksize - offset, n = tail > length? length: tail;
trace_off(printf("search %u bytes of bitmap %Lx from offset %u\n", n, blocknum, offset);)
// dump_buffer(buffer, 4086, 10);
for (length -= n; n--; p++)
if ((c = *p) != 0xff) {
int i, bit;
trace_off(printf("found byte at offset %u of bitmap %Lx = %hhx\n", p - buffer->data, blocknum, c);)
for (i = 0, bit = 1;; i++, bit <<= 1)
if (!(c & bit)) {
chunk = i + ((p - buffer->data) << 3) + (blocknum << bitmap_shift);
assert(!get_bitmap_bit(buffer->data, chunk & bitmap_mask));
set_bitmap_bit(buffer->data, chunk & bitmap_mask);
brelse_dirty(buffer);
sb->image.freechunks--;
set_sb_dirty(sb); // !!! optimize this away
return chunk;
}
}
brelse(buffer);
if (!length)
return 0;
if (++blocknum == sb->image.bitmap_blocks)
blocknum = 0;
offset = 0;
trace_off(printf("go to bitmap %Lx\n", blocknum);)
}
}
static int delete_snapshot(struct superblock *sb, unsigned tag);
// !!! possible/practical to topsort away this forward ref?
static chunk_t alloc_chunk(struct superblock *sb)
{
chunk_t last = sb->image.last_alloc, total = sb->image.chunks, found;
do {
if ((found = alloc_chunk_range(sb, last, total - last)) ||
(found = alloc_chunk_range(sb, 0, last))) {
sb->image.last_alloc = found;
set_sb_dirty(sb); // !!! optimize this away
return (found);
}
int tag = sb->image.snaplist[0].tag;
warn("snapshot store full, release snapshot %i", tag);
if (delete_snapshot(sb, tag))
error("snapshot delete failed");
} while (sb->image.snapshots);
error("snapshot store full");
return -1;
}
/* Snapshot Store Allocation */
static sector_t alloc_block(struct superblock *sb)
{
return alloc_chunk(sb) << sb->sectors_per_chunk_bits; // !!! assume blocksize = chunksize
}
static u64 alloc_exception(struct superblock *sb)
{
return alloc_chunk(sb);
}
static struct buffer *new_block(struct superblock *sb)
{
// !!! handle alloc_block failure
return getblk(sb->snapdev, alloc_block(sb), sb->blocksize);
}
static struct buffer *new_leaf(struct superblock *sb)
{
trace(printf("New leaf\n");)
struct buffer *buffer = new_block(sb);
init_leaf(buffer2leaf(buffer), sb->blocksize);
set_buffer_dirty(buffer);
return buffer;
}
static struct buffer *new_node(struct superblock *sb)
{
trace(printf("New node\n");)
struct buffer *buffer = new_block(sb);
struct enode *node = buffer2node(buffer);
node->count = 0;
set_buffer_dirty(buffer);
return buffer;
}
/* BTree debug dump */
static void show_subtree(struct superblock *sb, struct enode *node, int levels, int indent)
{
int i;
printf("%*s", indent, "");
printf("%i nodes:\n", node->count);
for (i = 0; i < node->count; i++) {
struct buffer *buffer = snapread(sb, node->entries[i].sector);
if (i)
printf("pivot = %Lx\n", (long long)node->entries[i].key);
if (levels)
show_subtree(sb, buffer2node(buffer), levels - 1, indent + 3);
else {
printf("%*s", indent + 3, "");
show_leaf(buffer2leaf(buffer));
}
brelse(buffer);
}
}
static void show_tree(struct superblock *sb)
{
struct buffer *buffer = snapread(sb, sb->image.etree_root);
show_subtree(sb, buffer2node(buffer), sb->image.etree_levels - 1, 0);
brelse(buffer);
}
/* High Level BTree Editing */
/*
* BTree insertion is a little hairy, as expected. We keep track of the
* access path in a vector of etree_path elements, each of which holds
* a node buffer and a pointer into the buffer data giving the address at
* which the next buffer in the path was found, which is also where a new
* node will be inserted if necessary. If a leaf is split we may need to
* work all the way up from the bottom to the top of the path, splitting
* index nodes as well. If we split the top index node we need to add
* a new tree level. We have to keep track of which nodes were modified
* and keep track of refcounts of all buffers involved, which can be quite
* a few.
*
* Note that the first key of an index block is never accessed. This is
* because for a btree, there is always one more key than nodes in each
* index node. In other words, keys lie between node pointers. We will
* micro-optimize by placing the node count in the first key, which allows
* a node to contain an esthetically pleasing binary number of pointers.
* (Not done yet.)
*/
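/*
 * Illustrative probe() result with two index levels (etree_levels == 2):
 * path[0] holds the root node buffer, path[1] the index node below it, and
 * the returned buffer is the leaf. Each path[i].pnext points just past the
 * entry that was followed, so (pnext - 1)->sector is the child we descended
 * into and pnext is where a new sibling key is inserted after a split.
 */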
#define MAX_ETREE_DEPTH 6
struct etree_path { struct buffer *buffer; struct index_entry *pnext; };
static struct buffer *probe(struct superblock *sb, u64 chunk, struct etree_path *path)
{
unsigned i, levels = sb->image.etree_levels;
struct buffer *nodebuf = snapread(sb, sb->image.etree_root);
struct enode *node = buffer2node(nodebuf);
for (i = 0; i < levels; i++) {
struct index_entry *pnext = node->entries, *top = pnext + node->count;
while (++pnext < top)
if (pnext->key > chunk)
break;
path[i].buffer = nodebuf;
path[i].pnext = pnext;
nodebuf = snapread(sb, (pnext - 1)->sector);
node = (struct enode *)nodebuf->data;
}
assert(((struct eleaf *)nodebuf->data)->magic == 0x1eaf);
return nodebuf;
}
static void brelse_path(struct etree_path *path, unsigned levels)
{
unsigned i;
for (i = 0; i < levels; i++)
brelse(path[i].buffer);
}
static void show_tree_range(struct superblock *sb, chunk_t start, unsigned leaves)
{
int levels = sb->image.etree_levels, level = -1;
struct etree_path path[levels];
struct buffer *nodebuf;
struct enode *node;
struct buffer *leafbuf;
#if 1
leafbuf = probe(sb, start, path);
level = levels - 1;
nodebuf = path[level].buffer;
node = buffer2node(nodebuf);
goto start;
#endif
while (1) {
do {
level++;
nodebuf = snapread(sb, level? path[level - 1].pnext++->sector: sb->image.etree_root);
node = buffer2node(nodebuf);
path[level].buffer = nodebuf;
path[level].pnext = node->entries;
trace(printf("push to level %i, %i nodes\n", level, node->count);)
} while (level < levels - 1);
trace(printf("do %i leaf nodes level = %i\n", node->count, level);)
while (path[level].pnext < node->entries + node->count) {
leafbuf = snapread(sb, path[level].pnext++->sector);
start: show_leaf(buffer2leaf(leafbuf));
brelse(leafbuf);
if (!--leaves) {
brelse_path(path, level + 1);
return;
}
}
do {
brelse(nodebuf);
if (!level)
return;
nodebuf = path[--level].buffer;
node = buffer2node(nodebuf);
trace(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - node->entries, node->count);)
} while (path[level].pnext == node->entries + node->count);
};
}
static void insert_child(struct enode *node, struct index_entry *p, sector_t child, u64 childkey)
{
memmove(p + 1, p, (char *)(&node->entries[0] + node->count) - (char *)p);
p->sector = child;
p->key = childkey;
node->count++;
}
static void add_exception_to_tree(struct superblock *sb, struct buffer *leafbuf, u64 target, u64 exception, int snapnum, struct etree_path path[], unsigned levels)
{
if (!add_exception_to_leaf(buffer2leaf(leafbuf), target, exception, snapnum, sb->snapmask)) {
brelse_dirty(leafbuf);
return;
}
trace(warn("add leaf");)
struct buffer *childbuf = new_leaf(sb);
u64 childkey = split_leaf(buffer2leaf(leafbuf), buffer2leaf(childbuf));
sector_t childsector = childbuf->sector;
if (add_exception_to_leaf(target < childkey? buffer2leaf(leafbuf): buffer2leaf(childbuf), target, exception, snapnum, sb->snapmask))
error("can't happen");
brelse_dirty(leafbuf);
brelse_dirty(childbuf);
while (levels--) {
struct index_entry *pnext = path[levels].pnext;
struct buffer *parentbuf = path[levels].buffer;
struct enode *parent = buffer2node(parentbuf);
if (parent->count < sb->blocks_per_node) {
insert_child(parent, pnext, childsector, childkey);
set_buffer_dirty(parentbuf);
return;
}
unsigned half = parent->count / 2;
u64 newkey = parent->entries[half].key;
struct buffer *newbuf = new_node(sb); // !!! handle error
struct enode *newnode = buffer2node(newbuf);
newnode->count = parent->count - half;
memcpy(&newnode->entries[0], &parent->entries[half], newnode->count * sizeof(struct index_entry));
parent->count = half;
if (pnext > &parent->entries[half]) {
pnext = pnext - &parent->entries[half] + newnode->entries;
set_buffer_dirty(parentbuf);
parentbuf = newbuf;
parent = newnode;
} else set_buffer_dirty(newbuf);
insert_child(parent, pnext, childsector, childkey);
set_buffer_dirty(parentbuf);
childkey = newkey;
childsector = newbuf->sector;
brelse(newbuf);
}
trace(printf("add tree level\n");)
struct buffer *newrootbuf = new_node(sb); // !!! handle error
struct enode *newroot = buffer2node(newrootbuf);
newroot->count = 2;
newroot->entries[0].sector = sb->image.etree_root;
newroot->entries[1].key = childkey;
newroot->entries[1].sector = childsector;
sb->image.etree_root = newrootbuf->sector;
sb->image.etree_levels++;
set_sb_dirty(sb);
brelse_dirty(newrootbuf);
}
#define chunk_highbit ((sizeof(chunk_t) * 8) - 1)
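/*
 * The top bit of source_chunk flags where a pending copyout reads from: set
 * means the source is an existing exception in the snapshot store, clear
 * means it is the origin device (see make_unique and finish_copyout).
 */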
static int finish_copyout(struct superblock *sb)
{
if (sb->copy_chunks) {
int is_snap = sb->source_chunk >> chunk_highbit;
chunk_t source = sb->source_chunk & ~(1ULL << chunk_highbit);
unsigned size = sb->copy_chunks << sb->image.chunksize_bits;
trace(warn("copy %u %schunks from %Lx to %Lx", sb->copy_chunks, is_snap? "snapshot ": "", source, sb->dest_exception);)
assert(size <= sb->copybuf_size);
pread(is_snap? sb->snapdev: sb->orgdev, sb->copybuf, size, source << sb->image.chunksize_bits); // 64 bit!!!
pwrite(sb->snapdev, sb->copybuf, size, sb->dest_exception << sb->image.chunksize_bits); // 64 bit!!!
sb->copy_chunks = 0;
}
return 0;
}
static int copyout(struct superblock *sb, chunk_t chunk, chunk_t exception)
{
#if 1
if (sb->source_chunk + sb->copy_chunks == chunk &&
sb->dest_exception + sb->copy_chunks == exception &&
sb->copy_chunks < sb->copybuf_size >> sb->image.chunksize_bits) {
sb->copy_chunks++;
return 0;
}
finish_copyout(sb);
sb->copy_chunks = 1;
sb->source_chunk = chunk;
sb->dest_exception = exception;
#else
int is_snap = sb->source_chunk >> chunk_highbit;
chunk_t source = chunk & ~((1ULL << chunk_highbit) - 1);
pread(is_snap? sb->snapdev: sb->orgdev, sb->copybuf, sb->chunksize, source << sb->image.chunksize_bits); // 64 bit!!!
pwrite(sb->snapdev, sb->copybuf, sb->chunksize, exception << sb->image.chunksize_bits); // 64 bit!!!
#endif
return 0;
}
/*
* This is the bit that does all the work. It's rather arbitrarily
* factored into a probe and test part, then an exception add part,
* called only if an exception for a given chunk isn't already present
* in the Btree. This factoring will change a few more times yet as
* the code gets more asynchronous and multi-threaded.
*/
static chunk_t make_unique(struct superblock *sb, chunk_t chunk, int snapnum)
{
unsigned levels = sb->image.etree_levels;
struct etree_path path[levels + 1];
struct buffer *leafbuf = probe(sb, chunk, path);
chunk_t exception = 0;
trace(warn("chunk %Lx, snapnum %i", chunk, snapnum));
if (snapnum == -1?
origin_chunk_unique(buffer2leaf(leafbuf), chunk, sb->snapmask):
snapshot_chunk_unique(buffer2leaf(leafbuf), chunk, snapnum, &exception))
{
trace(warn("chunk %Lx already unique in snapnum %i", chunk, snapnum);)
brelse(leafbuf);
goto out;
}
u64 newex = alloc_exception(sb);
if (newex == -1) {
brelse(leafbuf); // !!! maybe add_exception_to_tree should not do this
goto out;
}
// if (snapnum == -1)
copyout(sb, exception? (exception | (1ULL << chunk_highbit)): chunk, newex);
add_exception_to_tree(sb, leafbuf, chunk, newex, snapnum, path, levels);
exception = newex;
out:
brelse_path(path, levels);
return exception;
}
static int test_unique(struct superblock *sb, chunk_t chunk, int snapnum, chunk_t *exception)
{
unsigned levels = sb->image.etree_levels;
struct etree_path path[levels + 1];
struct buffer *leafbuf = probe(sb, chunk, path);
trace(warn("chunk %Lx, snapnum %i", chunk, snapnum));
int result = snapnum == -1?
origin_chunk_unique(buffer2leaf(leafbuf), chunk, sb->snapmask):
snapshot_chunk_unique(buffer2leaf(leafbuf), chunk, snapnum, exception);
brelse(leafbuf);
brelse_path(path, levels);
return result;
}
/* Snapshot Store Superblock handling */
static u64 calc_snapmask(struct superblock *sb)
{
u64 mask = 0;
int i;
for (i = 0; i < sb->image.snapshots; i++)
mask |= 1ULL << sb->image.snaplist[i].bit;
return mask;
}
static int tag_snapnum(struct superblock *sb, unsigned tag)
{
unsigned i, n = sb->image.snapshots;
for (i = 0; i < n; i++)
if (sb->image.snaplist[i].tag == tag)
return sb->image.snaplist[i].bit;
return -1;
}
static int snapnum_tag(struct superblock *sb, unsigned bit)
{
unsigned i, n = sb->image.snapshots;
for (i = 0; i < n; i++)
if (sb->image.snaplist[i].bit == bit)
return sb->image.snaplist[i].tag;
return -1;
}
static int create_snapshot(struct superblock *sb, unsigned snaptag)
{
unsigned i, snapshots = sb->image.snapshots;
struct snapshot *snapshot;
/* Check tag not already used */
for (i = 0; i < snapshots; i++)
if (sb->image.snaplist[i].tag == snaptag)
return -1;
/* Find available snapshot bit */
for (i = 0; i < MAX_SNAPSHOTS; i++)
if (!(sb->snapmask & (1ULL << i)))
goto create;
return -EFULL;
create:
trace_on(printf("Create snapshot %i (internal %i)\n", snaptag, i);)
snapshot = sb->image.snaplist + sb->image.snapshots++;
*snapshot = (struct snapshot){ .tag = snaptag, .bit = i, .ctime = time(NULL) };
sb->snapmask |= (1ULL << i);
set_sb_dirty(sb);
return i;
};
/*
* delete_snapshot: remove all exceptions from a given snapshot from a leaf
* working from top to bottom of the exception list clearing snapshot bits
* and packing the nonzero exceptions into the top of the block. Then work
* from bottom to top in the directory map packing nonempty entries into the
* bottom of the map.
*/
static int delete_snapshots_from_leaf(struct superblock *sb, struct eleaf *leaf, u64 snapmask, unsigned max_dirty)
{
struct exception *p = emap(leaf, leaf->count), *dest = p;
struct etree_map *pmap, *dmap;
unsigned i, any = 0;
/* Scan top to bottom clearing snapshot bit and moving
* non-zero entries to top of block */
for (i = leaf->count; i--;) {
while (p != emap(leaf, i)) {
if (dirty_buffer_count >= max_dirty) // stop clearing once the dirty threshold is hit
snapmask = 0;
u64 share = (--p)->share;
any |= share & snapmask;
if ((share &= ~snapmask))
*--dest = *p;
else
free_chunk(sb, p->chunk);
}
leaf->map[i].offset = (char *)dest - (char *)leaf;
}
/* Remove empties from map */
dmap = pmap = &leaf->map[0];
for (i = 0; i < leaf->count; i++, pmap++)
if (pmap->offset != (pmap + 1)->offset)
*dmap++ = *pmap;
dmap->offset = pmap->offset;
dmap->rchunk = 0; // tidy up
leaf->count = dmap - &leaf->map[0];
return !!any;
}
static void delete_snapshots_from_tree(struct superblock *sb, u64 snapmask)
{
int levels = sb->image.etree_levels, level = -1;
struct etree_path path[levels];
struct buffer *nodebuf;
struct enode *node;
trace_on(printf("delete snapshot mask %Lx\n", snapmask);)
while (1) {
do {
level++;
nodebuf = snapread(sb, level? path[level - 1].pnext++->sector: sb->image.etree_root);
node = buffer2node(nodebuf);
path[level].buffer = nodebuf;
path[level].pnext = node->entries;
trace(printf("push to level %i, %i nodes\n", level, node->count);)
} while (level < levels - 1);
trace(printf("do %i leaf nodes\n", node->count);)
while (path[level].pnext < node->entries + node->count) {
struct buffer *leafbuf = snapread(sb, path[level].pnext++->sector);
trace_off(printf("process leaf %Lx\n", leafbuf->sector);)
delete_snapshots_from_leaf(sb, buffer2leaf(leafbuf), snapmask, -1);
brelse(leafbuf);
}
do {
brelse(nodebuf);
if (!level)
return;
nodebuf = path[--level].buffer;
node = buffer2node(nodebuf);
trace(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - node->entries, node->count);)
} while (path[level].pnext == node->entries + node->count);
};
}
/*
* Delete algorithm (flesh this out)
*
* reached the end of an index block:
* try to merge with an index block in hold[]
* if can't merge then maybe can rebalance
* if can't merge then release the block in hold[] and move this block to hold[]
* can't merge if there's no block in hold[] or can't fit two together
* if can merge
* release and free this index block and
* delete from parent:
* if parent count zero, the grandparent key is going to be deleted, updating the pivot
* otherwise parent's deleted key becomes new pivot
*/
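/*
 * In delete_tree_range below, hold[level] keeps the most recently finished
 * buffer at each index level (prevleaf plays the same role for leaves), so
 * the next finished sibling can be merged into it before it is released.
 */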
static inline struct enode *path_node(struct etree_path path[], int level)
{
return buffer2node(path[level].buffer);
}
static inline int finished_level(struct etree_path path[], int level)
{
struct enode *node = path_node(path, level);
return path[level].pnext == node->entries + node->count;
}
static void remove_index(struct etree_path path[], int level)
{
struct enode *node = path_node(path, level);
chunk_t pivot = (path[level].pnext)->key; // !!! out of bounds for delete of last from full index
int count = node->count, i;
// stomps the node count (if 0th key holds count)
memmove(path[level].pnext - 1, path[level].pnext,
(char *)&node->entries[count] - (char *)path[level].pnext);
node->count = count - 1;
--(path[level].pnext);
set_buffer_dirty(path[level].buffer);
// no pivot for last entry
if (path[level].pnext == node->entries + node->count)
return;
// climb up to common parent and set pivot to deleted key
// what if index is now empty? (no deleted key)
// then some key above is going to be deleted and used to set pivot
if (path[level].pnext == node->entries && level) {
for (i = level - 1; path[i].pnext - 1 == path_node(path, i)->entries; i--)
if (!i)
return;
(path[i].pnext - 1)->key = pivot;
set_buffer_dirty(path[i].buffer);
}
}
static void brelse_free(struct superblock *sb, struct buffer *buffer)
{
brelse(buffer);
if (buffer->count) {
warn("free block %Lx still in use!", (long long)buffer->sector);
return;
}
free_block(sb, buffer->sector);
evict_buffer(buffer);
}
static chunk_t delete_tree_range(struct superblock *sb, u64 snapmask, chunk_t resume, unsigned max_dirty)
{
int levels = sb->image.etree_levels, level = levels - 1;
struct etree_path path[levels], hold[levels];
struct buffer *leafbuf, *prevleaf = NULL;
unsigned i;
for (i = 0; i < levels; i++) // can be initializer if not dynamic array (change it?)
hold[i] = (struct etree_path){ };
leafbuf = probe(sb, resume, path);
while (1) { /* in-order leaf walk */
trace_off(show_leaf(buffer2leaf(leafbuf));)
// should pass in and act on max_dirty...
if (delete_snapshots_from_leaf(sb, buffer2leaf(leafbuf), snapmask, max_dirty))
set_buffer_dirty(leafbuf);
if (prevleaf) { /* try to merge this leaf with prev */
struct eleaf *this = buffer2leaf(leafbuf);
struct eleaf *prev = buffer2leaf(prevleaf);
trace_off(warn("check leaf %p against %p", leafbuf, prevleaf);)
trace_off(warn("need = %i, free = %i", leaf_payload(this), leaf_freespace(prev));)
if (leaf_payload(this) <= leaf_freespace(prev)) {
trace_off(warn(">>> can merge leaf %p into leaf %p", leafbuf, prevleaf);)
merge_leaves(prev, this);
remove_index(path, level);
set_buffer_dirty(prevleaf);
brelse_free(sb, leafbuf);
goto keep_prev_leaf;
}
brelse(prevleaf);
}
prevleaf = leafbuf;
keep_prev_leaf:
if (finished_level(path, level)) {
do { /* pop and try to merge finished nodes */
if (hold[level].buffer) {
assert(level); /* root node can't have any prev */
struct enode *this = path_node(path, level);
struct enode *prev = path_node(hold, level);
trace_off(warn("check node %p against %p", this, prev);)
trace_off(warn("this count = %i prev count = %i", this->count, prev->count);)
if (this->count <= sb->blocks_per_node - prev->count) {
trace_off(warn(">>> can merge node %p into node %p", this, prev);)
merge_nodes(prev, this);
remove_index(path, level - 1);
set_buffer_dirty(hold[level].buffer);
brelse_free(sb, path[level].buffer);
goto keep_prev_node;
}
brelse(hold[level].buffer);
}
hold[level].buffer = path[level].buffer;
keep_prev_node:
if (!level) { /* remove levels if possible */
while (levels > 1 && path_node(hold, 0)->count == 1) {
trace_off(warn("drop btree level");)
sb->image.etree_root = hold[1].buffer->sector;
brelse_free(sb, hold[0].buffer);
levels = --sb->image.etree_levels;
memcpy(hold, hold + 1, levels * sizeof(hold[0]));
set_sb_dirty(sb);
}
brelse(prevleaf);
brelse_path(hold, levels);
return 0;
}
level--;
trace_off(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - path_node(path, level)->entries, path_node(path, level)->count);)
} while (finished_level(path, level));
do { /* push back down to leaf level */
struct buffer *nodebuf = snapread(sb, path[level++].pnext++->sector);
path[level].buffer = nodebuf;
path[level].pnext = buffer2node(nodebuf)->entries;
trace_off(printf("push to level %i, %i nodes\n", level, path_node(path, level)->count);)
} while (level < levels - 1);
}
/* Exit if dirty buffer count above threshold */
// might incur a few extra index reads but oh well
if (dirty_buffer_count >= max_dirty) {
brelse(prevleaf);
brelse_path(path, levels);
for (i = 0; i < levels; i++)
if (hold[i].buffer)
brelse(hold[i].buffer);
// sb->delete_progress = ???;
return 1;
}
leafbuf = snapread(sb, path[level].pnext++->sector);
};
}
static int delete_snapshot(struct superblock *sb, unsigned tag)
{
struct snapshot *snapshot;
unsigned i, bit;
for (i = 0; i < sb->image.snapshots; i++)
if (sb->image.snaplist[i].tag == tag)
goto delete;
return -1;
delete:
snapshot = sb->image.snaplist + i;
bit = snapshot->bit;
trace_on(printf("Delete snapshot %i (internal %i)\n", tag, bit);)
memmove(snapshot, snapshot + 1, (char *)(sb->image.snaplist + --sb->image.snapshots) - (char *)snapshot);
sb->snapmask &= ~(1ULL << bit);
delete_snapshots_from_tree(sb, 1ULL << bit);
set_sb_dirty(sb);
return 0;
};
static void show_snapshots(struct superblock *sb)
{
unsigned snapnum, snapshots = sb->image.snapshots;
printf("%u snapshots\n", snapshots);
for (snapnum = 0; snapnum < snapshots; snapnum++) {
struct snapshot *snapshot = sb->image.snaplist + snapnum;
printf("snapshot %u tag %u prio %i created %x\n",
snapshot->bit,
snapshot->tag,
snapshot->prio,
snapshot->ctime);
}
};
/* Lock snapshot reads against origin writes */
static void reply(fd_t sock, struct messagebuf *message)
{
trace(warn("%x/%u", message->head.code, message->head.length);)
writepipe(sock, &message->head, message->head.length + sizeof(message->head));
}
struct client
{
u64 id;
fd_t sock;
int snap;
};
struct pending
{
unsigned holdcount;
struct client *client;
struct messagebuf message;
};
struct snaplock_wait
{
struct pending *pending;
struct snaplock_wait *next;
};
struct snaplock_hold
{
struct client *client;
struct snaplock_hold *next;
};
struct snaplock
{
struct snaplock_wait *waitlist;
struct snaplock_hold *holdlist;
struct snaplock *next;
chunk_t chunk;
};
struct snaplock *new_snaplock(struct superblock *sb)
{
return malloc(sizeof(struct snaplock));
}
struct snaplock_wait *new_snaplock_wait(struct superblock *sb)
{
return malloc(sizeof(struct snaplock_wait));
}
struct snaplock_hold *new_snaplock_hold(struct superblock *sb)
{
return malloc(sizeof(struct snaplock_hold));
}
void free_snaplock(struct superblock *sb, struct snaplock *p)
{
free(p);
}
void free_snaplock_hold(struct superblock *sb, struct snaplock_hold *p)
{
free(p);
}
void free_snaplock_wait(struct superblock *sb, struct snaplock_wait *p)
{
free(p);
}
unsigned snaplock_hash(struct superblock *sb, chunk_t chunk)
{
return ((u32)(chunk * 3498734713U)) >> (32 - sb->snaplock_hash_bits);
}
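/* Multiplicative hash: spread the chunk number with a large odd constant and
 * keep the top snaplock_hash_bits bits as the bucket index. */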
struct snaplock *find_snaplock(struct snaplock *list, chunk_t chunk)
{
for (; list; list = list->next)
if (list->chunk == chunk)
return list;
return NULL;
}
void waitfor_chunk(struct superblock *sb, chunk_t chunk, struct pending **pending)
{
struct snaplock *lock;
if ((lock = find_snaplock(sb->snaplocks[snaplock_hash(sb, chunk)], chunk))) {
if (!*pending) {
// arguably we should know the client and fill it in here
*pending = calloc(1, sizeof(struct pending));
(*pending)->holdcount = 1;
}
struct snaplock_wait *wait = new_snaplock_wait(sb);
wait->pending = *pending;
wait->next = lock->waitlist;
lock->waitlist = wait;
(*pending)->holdcount++;
}
}
void readlock_chunk(struct superblock *sb, chunk_t chunk, struct client *client)
{
struct snaplock **bucket = &sb->snaplocks[snaplock_hash(sb, chunk)];
struct snaplock *lock;
if (!(lock = find_snaplock(*bucket, chunk))) {
lock = new_snaplock(sb);
*lock = (struct snaplock){ .chunk = chunk, .next = *bucket };
*bucket = lock;
}
struct snaplock_hold *hold = new_snaplock_hold(sb);
hold->client = client;
hold->next = lock->holdlist;
lock->holdlist = hold;
}
struct snaplock *release_lock(struct superblock *sb, struct snaplock *lock, struct client *client)
{
struct snaplock *ret = lock;
struct snaplock_hold **holdp = &lock->holdlist;
while (*holdp && (*holdp)->client != client)
holdp = &(*holdp)->next;
if (!*holdp) {
trace_on(printf("chunk %Lx holder %Lu not found\n", lock->chunk, client->id);)
return NULL;
}
/* Delete and free holder record */
struct snaplock_hold *next = (*holdp)->next;
free_snaplock_hold(sb, *holdp);
*holdp = next;
if (lock->holdlist)
return ret;
/* Release and delete waiters, delete lock */
struct snaplock_wait *list = lock->waitlist;
while (list) {
struct snaplock_wait *next = list->next;
assert(list->pending->holdcount);
if (!--(list->pending->holdcount)) {
struct pending *pending = list->pending;
reply(pending->client->sock, &pending->message);
free(pending);
}
free_snaplock_wait(sb, list);
list = next;
}
ret = lock->next;
free_snaplock(sb, lock);
return ret;
}
int release_chunk(struct superblock *sb, chunk_t chunk, struct client *client)
{
trace(printf("release %Lx\n", chunk);)
struct snaplock **lockp = &sb->snaplocks[snaplock_hash(sb, chunk)];
/* Find pointer to lock record */
while (*lockp && (*lockp)->chunk != chunk)
lockp = &(*lockp)->next;
struct snaplock *next, *lock = *lockp;
if (!lock) {
trace_on(printf("chunk %Lx not locked\n", chunk);)
return -1;
}
next = release_lock(sb, lock, client);
if (!next)
return -2;
*lockp = next;
return 0;
}
void show_locks(struct superblock *sb)
{
unsigned n = 0, i;
for (i = 0; i < (1 << sb->snaplock_hash_bits); i++) {
struct snaplock *lock = sb->snaplocks[i];
if (!lock)
continue;
if (!n) printf("Locks:\n");
printf("[%03u] ", i);
do {
printf("chunk %Lx ", lock->chunk);
struct snaplock_hold *hold = lock->holdlist;
for (; hold; hold = hold->next)
printf("held by client %Lu ", hold->client->id);
struct snaplock_wait *wait = lock->waitlist;
for (; wait; wait = wait->next)
printf("wait [%02hx/%u] ", snaplock_hash(sb, (u32)wait->pending), wait->pending->holdcount);
} while ((lock = lock->next));
printf("\n");
n++;
}
if (!n) printf("-- no locks --\n");
}
/* Build up a response as a list of chunk ranges */
struct addto
{
unsigned count;
chunk_t firstchunk;
chunk_t nextchunk;
struct rwmessage *reply;
shortcount *countp;
chunk_t *top;
char *lim;
};
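/*
 * Each range in a reply body is encoded as a chunk_t starting chunk followed
 * by a shortcount length. addto_response extends the current range while
 * chunks arrive contiguously and starts a new record otherwise;
 * finish_reply_ patches in the final length and the range count.
 */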
void check_response_full(struct addto *r, unsigned bytes)
{
if ((char *)r->top < r->lim - bytes)
return;
error("Need realloc");
}
void addto_response(struct addto *r, chunk_t chunk)
{
if (chunk != r->nextchunk) {
if (r->top) {
trace_off(warn("finish old range\n");)
*(r->countp) = (r->nextchunk - r->firstchunk);
} else {
trace_off(warn("alloc new reply");)
r->reply = (void *) malloc(sizeof(struct messagebuf));
r->top = (chunk_t *)(((char *)r->reply) + sizeof(struct head) + offsetof(struct rw_request, ranges));
r->lim = ((char *)r->reply) + maxbody;
r->count++;
}
trace_off(warn("start new range");)
check_response_full(r, 2*sizeof(chunk_t));
r->firstchunk = *(r->top)++ = chunk;
r->countp = (shortcount *)r->top;
r->top = (chunk_t *)(((shortcount *)r->top) + 1);
}
r->nextchunk = chunk + 1;
}
int finish_reply_(struct addto *r, unsigned code, unsigned id)
{
if (!r->countp)
return 0;
*(r->countp) = (r->nextchunk - r->firstchunk);
r->reply->head.code = code;
r->reply->head.length = (char *)r->top - (char *)r->reply - sizeof(struct head);
r->reply->body.id = id;
r->reply->body.count = r->count;
return 1;
}
void finish_reply(int sock, struct addto *r, unsigned code, unsigned id)
{
if (finish_reply_(r, code, id))
reply(sock, (struct messagebuf *)r->reply);
free(r->reply);
}
/* Initialization, State load/save */
void setup_sb(struct superblock *sb)
{
unsigned blocksize_bits = sb->image.blocksize_bits;
unsigned chunksize_bits = sb->image.chunksize_bits;
sb->blocksize = 1 << blocksize_bits;
sb->chunksize = 1 << chunksize_bits;
sb->sectors_per_block_bits = blocksize_bits - SECTOR_BITS;
sb->sectors_per_chunk_bits = chunksize_bits - SECTOR_BITS;
sb->blocks_per_node = (sb->blocksize - offsetof(struct enode, entries)) / sizeof(struct index_entry);
#ifdef BUSHY
sb->blocks_per_node = 10;
#endif
sb->copybuf = malloc_aligned(sb->copybuf_size = (32 * sb->chunksize), 4096); // !!! check failed
sb->sectors_per_block = 1 << sb->sectors_per_block_bits;
sb->sectors_per_chunk = 1 << sb->sectors_per_chunk_bits;
sb->snapmask = 0;
sb->flags = 0;
sb->max_commit_blocks = (sb->blocksize - sizeof(struct commit_block)) / sizeof(sector_t);
unsigned snaplock_hash_bits = 8;
sb->snaplock_hash_bits = snaplock_hash_bits;
sb->snaplocks = (struct snaplock **)calloc(1 << snaplock_hash_bits, sizeof(struct snaplock *));
}
void load_sb(struct superblock *sb)
{
struct buffer *buffer = bread(sb->snapdev, SB_SECTOR, SB_SIZE);
memcpy(&sb->image, buffer->data, sizeof(sb->image));
assert(!memcmp(sb->image.magic, SB_MAGIC, sizeof(sb->image.magic)));
brelse(buffer);
setup_sb(sb);
sb->snapmask = calc_snapmask(sb);
trace_on(printf("Active snapshot mask: %016llx\n", sb->snapmask);)
}
void save_sb(struct superblock *sb)
{
if (sb->flags & SB_DIRTY) {
struct buffer *buffer = getblk(sb->snapdev, SB_SECTOR, SB_SIZE);
memcpy(buffer->data, &sb->image, sizeof(sb->image));
write_buffer(buffer);
brelse(buffer);
sb->flags &= ~SB_DIRTY;
}
}
void save_state(struct superblock *sb)
{
flush_buffers();
save_sb(sb);
}
/*
* This source compiles either the snapshot server or the snapshot store setup
* utility, depending on whether the macro variable CREATE is defined.
*
* I'll leave all the testing hooks lying around in the main routine for now,
* since the low level components still tend to break every now and then and
* require further unit testing.
*/
int init_snapstore(struct superblock *sb)
{
int i, error;
unsigned sectors_per_block_bits = 3;
sb->image = (struct disksuper){ .magic = SB_MAGIC };
sb->image.etree_levels = 1;
sb->image.blocksize_bits = SECTOR_BITS + sectors_per_block_bits;
sb->image.chunksize_bits = sb->image.blocksize_bits; // !!! just for now
setup_sb(sb);
u64 size;
if ((error = fd_size(sb->snapdev, &size)))
error("Error %i: %s determining snapshot store size", error, strerror(error));
sb->image.chunks = size >> sb->image.chunksize_bits;
if ((error = fd_size(sb->orgdev, &size)))
error("Error %i: %s determining origin volume size", errno, strerror(errno));
sb->image.orgchunks = size >> sb->image.chunksize_bits;
sb->image.journal_size = 100;
#ifdef TEST_JOURNAL
sb->image.journal_size = 5;
#endif
sb->image.journal_next = 0;
sb->image.sequence = sb->image.journal_size;
init_allocation(sb);
set_sb_dirty(sb);
for (i = 0; i < sb->image.journal_size; i++) {
struct buffer *buffer = jgetblk(sb, i);
struct commit_block *commit = (struct commit_block *)buffer->data;
*commit = (struct commit_block){ .magic = JMAGIC, .sequence = i };
#ifdef TEST_JOURNAL
commit->sequence = (i + 3) % 5;
#endif
commit->checksum = -checksum_block(sb, (void *)commit);
brelse_dirty(buffer);
}
#ifdef TEST_JOURNAL
show_journal(sb);
show_tree(sb);
flush_buffers();
recover_journal(sb);
show_buffers();
#endif
#if 0
printf("chunk = %Lx\n", alloc_chunk_range(sb, sb->image.chunks - 1, 1));
// struct buffer *buffer = snapread(sb, sb->image.bitmap_base + 3 * 8);
// dump_buffer(buffer, 4090, 6);
return 0;
#endif
#if 0
grab_chunk(sb, 32769);
struct buffer *buffer = snapread(sb, sb->image.bitmap_base + 8);
printf("sector %Lx\n", buffer->sector);
free_chunk(sb, 32769);
return 0;
#endif
struct buffer *leafbuf = new_leaf(sb);
struct buffer *rootbuf = new_node(sb);
buffer2node(rootbuf)->count = 1;
buffer2node(rootbuf)->entries[0].sector = leafbuf->sector;
sb->image.etree_root = rootbuf->sector;
#if 0
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
// free_chunk(sb, 23);
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
return 0;
#endif
brelse_dirty(rootbuf);
brelse_dirty(leafbuf);
#if 0
struct buffer *leafbuf1 = new_leaf(sb);
struct buffer *leafbuf2 = new_leaf(sb);
struct eleaf *leaf1 = buffer2leaf(leafbuf1);
struct eleaf *leaf2 = buffer2leaf(leafbuf2);
init_leaf(leaf1, 256);
init_leaf(leaf2, 256);
add_exception_to_leaf(leaf1, 0x111, 0x11, 0, 3);
add_exception_to_leaf(leaf2, 0x222, 0x11, 1, 3);
add_exception_to_leaf(leaf2, 0x333, 0x33, 1, 3);
show_leaf(leaf1);
show_leaf(leaf2);
merge_leaves(leaf1, leaf2);
show_leaf(leaf1);
return 0;
#endif
#ifdef TEST_JOURNAL
show_buffers();
show_dirty_buffers();
commit_transaction(sb);
evict_buffers();
show_journal(sb);
show_tree(sb);
recover_journal(sb);
evict_buffers();
show_tree(sb);
#endif
save_state(sb);
return 0;
}
// Expand snapshot store:
// Calculate num bitmap blocks for new size
// Copy bitmap blocks to current top
// Clear new bitmap blocks
// Reserve new bitmap blocks
// Clear remainder bits in old last bitmap byte
// Set remainder bits in new last bitmap byte
// Set new bitmap base and chunks count
static void expand_snapstore(struct superblock *sb, u64 newchunks)
{
u64 oldchunks = sb->image.chunks;
unsigned oldbitmaps = sb->image.bitmap_blocks;
unsigned newbitmaps = calc_bitmap_blocks(sb, newchunks);
unsigned blocksize = sb->blocksize;
unsigned blockshift = sb->image.blocksize_bits;
u64 oldbase = sb->image.bitmap_base << SECTOR_BITS;
u64 newbase = sb->image.bitmap_base << SECTOR_BITS;
int i;
for (i = 0; i < oldbitmaps; i++) {
// do it one block at a time for now !!! sucks
// maybe should do copy with bread/write?
pread(sb->snapdev, sb->copybuf, blocksize, oldbase + (i << blockshift)); // 64 bit!!!
pwrite(sb->snapdev, sb->copybuf, blocksize, newbase + (i << blockshift)); // 64 bit!!!
}
if ((oldchunks & 7)) {
sector_t sector = (oldbase >> SECTOR_BITS) + ((oldbitmaps - 1) << sb->sectors_per_block_bits);
struct buffer *buffer = getblk(sb->snapdev, sector, blocksize);
buffer->data[(oldchunks >> 3) & (blocksize - 1)] &= ~(0xff << (oldchunks & 7));
brelse_dirty(buffer);
}
for (i = oldbitmaps; i < newbitmaps; i++) {
struct buffer *buffer = getblk(sb->snapdev, (newbase >> SECTOR_BITS) + (i << sb->sectors_per_block_bits), blocksize);
memset(buffer->data, 0, sb->blocksize);
/* Suppress overrun allocation in partial last byte */
if (i == newbitmaps - 1 && (newchunks & 7))
buffer->data[(newchunks >> 3) & (blocksize - 1)] |= 0xff << (newchunks & 7);
brelse_dirty(buffer);
}
for (i = 0; i < newbitmaps; i++) {
grab_chunk(sb, (newbase >> blockshift) + i); // !!! assume blocksize = chunksize
}
sb->image.bitmap_base = newbase >> SECTOR_BITS;
sb->image.chunks = newchunks;
save_state(sb);
}
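/*
 * Worked example of the remainder-bit arithmetic above (illustrative only,
 * not called anywhere): with newchunks = 100 the last bitmap byte covers
 * chunks 96..103, so chunks 100..103 lie past the end of the store and get
 * pre-marked allocated with the mask 0xff << (100 & 7) == 0xf0, while the
 * overrun bits that covered chunks past the old end are cleared so those
 * chunks become allocatable again.
 */
#if 0
static void demo_remainder_masks(u64 oldchunks, u64 newchunks)
{
	u8 clear_old = ~(0xff << (oldchunks & 7)); /* keep bits for valid old chunks */
	u8 set_new = 0xff << (newchunks & 7); /* mark chunks past the new end as allocated */
	printf("old last byte &= %02x, new last byte |= %02x\n", clear_old, set_new);
}
#endif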
int client_locks(struct superblock *sb, struct client *client, int check)
{
int i;
for (i = 0; i < (1 << sb->snaplock_hash_bits); i++) {
struct snaplock **lockp = &sb->snaplocks[i];
while (*lockp) {
struct snaplock_hold *hold;
for (hold = (*lockp)->holdlist; hold; hold = hold->next)
if (hold->client == client) {
if (check)
return 1;
*lockp = release_lock(sb, *lockp, client);
goto next;
}
lockp = &(*lockp)->next;
next:
continue;
}
}
return 0;
}
#define check_client_locks(x, y) client_locks(x, y, 1)
#define free_client_locks(x, y) client_locks(x, y, 0)
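/*
 * Minimal usage sketch, illustrative only and not called anywhere:
 * check_client_locks() reports whether a client still holds any snapshot
 * read locks; free_client_locks() walks the same hash chains and releases
 * them, e.g. for a client that has gone away.  The helper name below is
 * invented for the example.
 */
#if 0
static void demo_drop_client_locks(struct superblock *sb, struct client *client)
{
	if (check_client_locks(sb, client))
		warn("client %Li still holds read locks, releasing them", client->id);
	free_client_locks(sb, client);
}
#endif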
/*
* Responses to IO requests take two quite different paths through the
* machinery:
*
* - Origin write requests are just sent back with their message
* code changed, unless they have to wait for a snapshot read
* lock in which case the incoming buffer is copied and the
* response takes a kafkaesque journey through the read locking
* bureaucracy.
*
* - Responses to snapshot read or write requests have to be built
* up painstakingly in allocated buffers, keeping a lot of state
* around so that they end up with a minimum number of contiguous
* chunk ranges. Once complete they can always be sent
* immediately.
*
* To mess things up further, snapshot read requests can return both
* a list of origin ranges and a list of snapshot store ranges. In
* the latter case the specific snapshot store chunks in each logical
* range are also returned, because they can (normally will) be
* discontiguous. This goes back to the client in two separate
* messages, on the theory that the client will find it nice to be
* able to process the origin read ranges and snapshot read chunks
* separately. We'll see how good an idea that is.
*
* The implementation ends up looking nice and tidy, but appearances
* can be deceiving.
*/
int incoming(struct superblock *sb, struct client *client)
{
struct messagebuf message;
unsigned sock = client->sock;
int i, j, err;
if ((err = readpipe(sock, &message.head, sizeof(message.head))))
goto pipe_error;
trace(warn("%x/%u", message.head.code, message.head.length);)
if (message.head.length > maxbody)
goto message_too_long;
if ((err = readpipe(sock, &message.body, message.head.length)))
goto pipe_error;
switch (message.head.code) {
case QUERY_WRITE:
if (client->snap == -1) {
struct pending *pending = NULL;
struct rw_request *body = (struct rw_request *)message.body;
struct chunk_range *p = body->ranges;
chunk_t chunk;
if (message.head.length < sizeof(*body))
goto message_too_short;
trace(printf("origin write query, %u ranges\n", body->count);)
for (i = 0; i < body->count; i++, p++)
for (j = 0, chunk = p->chunk; j < p->chunks; j++, chunk++) {
chunk_t exception = make_unique(sb, chunk, -1);
if (exception == -1)
error("snapshot store full"); // !!! don't abort, send error
if (exception)
waitfor_chunk(sb, chunk, &pending);
}
finish_copyout(sb);
commit_transaction(sb);
message.head.code = REPLY_ORIGIN_WRITE;
if (pending) {
pending->client = client;
memcpy(&pending->message, &message, message.head.length + sizeof(struct head));
pending->holdcount--;
break;
}
reply(sock, &message);
break;
}
struct rw_request *body = (struct rw_request *)message.body;
if (message.head.length < sizeof(*body))
goto message_too_short;
trace(printf("snapshot write request, %u ranges\n", body->count);)
struct addto snap = { .nextchunk = -1 };
for (i = 0; i < body->count; i++)
for (j = 0; j < body->ranges[i].chunks; j++) {
chunk_t chunk = body->ranges[i].chunk + j;
chunk_t exception = make_unique(sb, chunk, client->snap);
if (exception == -1)
error("snapshot store full"); // !!! don't abort, send error
trace(printf("exception = %Lx\n", exception);)
addto_response(&snap, chunk);
check_response_full(&snap, sizeof(chunk_t));
*(snap.top)++ = exception;
}
finish_copyout(sb);
commit_transaction(sb);
finish_reply(client->sock, &snap, REPLY_SNAPSHOT_WRITE, body->id);
break;
case QUERY_SNAPSHOT_READ:
{
struct rw_request *body = (struct rw_request *)message.body;
if (message.head.length < sizeof(*body))
goto message_too_short;
trace(printf("snapshot read request, %u ranges\n", body->count);)
struct addto snap = { .nextchunk = -1 }, org = { .nextchunk = -1 };
for (i = 0; i < body->count; i++)
for (j = 0; j < body->ranges[i].chunks; j++) {
chunk_t chunk = body->ranges[i].chunk + j, exception = 0;
trace(warn("read %Lx", chunk));
test_unique(sb, chunk, client->snap, &exception);
if (exception) {
trace(warn("read exception %Lx", exception));
addto_response(&snap, chunk);
check_response_full(&snap, sizeof(chunk_t));
*(snap.top)++ = exception;
} else {
trace(warn("read origin %Lx", chunk));
addto_response(&org, chunk);
readlock_chunk(sb, chunk, client);
}
}
finish_reply(client->sock, &org, REPLY_SNAPSHOT_READ_ORIGIN, body->id);
finish_reply(client->sock, &snap, REPLY_SNAPSHOT_READ, body->id);
break;
}
case FINISH_SNAPSHOT_READ:
{
struct rw_request *body = (struct rw_request *)message.body;
if (message.head.length < sizeof(*body))
goto message_too_short;
trace(printf("finish snapshot read, %u ranges\n", body->count);)
for (i = 0; i < body->count; i++)
for (j = 0; j < body->ranges[i].chunks; j++)
release_chunk(sb, body->ranges[i].chunk + j, client);
break;
}
case IDENTIFY:
{
int tag = ((struct identify *)message.body)->snap, snap = tag_snapnum(sb, tag);
if (snap >= 0)
client->snap = snap;
client->id = ((struct identify *)message.body)->id;
warn("client id %Li, snapshot %i (snapnum %i)", client->id, tag, snap);
outbead(sock, REPLY_IDENTIFY, struct { });
break;
}
case UPLOAD_LOCK:
break;
case FINISH_UPLOAD_LOCK:
break;
case CREATE_SNAPSHOT:
create_snapshot(sb, ((struct create_snapshot *)message.body)->snap);
save_state(sb);
outbead(sock, REPLY_CREATE_SNAPSHOT, struct { });
break;
case DELETE_SNAPSHOT:
delete_snapshot(sb, ((struct create_snapshot *)message.body)->snap);
save_state(sb);
outbead(sock, REPLY_DELETE_SNAPSHOT, struct { });
break;
case INITIALIZE_SNAPSTORE:
init_snapstore(sb);
break;
case DUMP_TREE:
show_tree(sb);
break;
case START_SERVER:
warn("Activating server");
load_sb(sb);
if (sb->image.flags & SB_BUSY) {
warn("Server was not shut down properly");
jtrace(show_journal(sb);)
recover_journal(sb);
} else {
sb->image.flags |= SB_BUSY;
set_sb_dirty(sb);
save_sb(sb);
}
break;
case LIST_SNAPSHOTS:
{
int i, n = sb->image.snapshots;
struct snapshot *snap = sb->image.snaplist;
outhead(sock, SNAPSHOT_LIST, sizeof(struct snaplist) + n * sizeof(struct snapinfo));
for (i = 0; i < n; i++)
write(sock, &(struct snapinfo){
.snap = snap[i].tag,
.prio = snap[i].prio,
.ctime = snap[i].ctime },
sizeof(struct snapinfo));
break;
}
case SHUTDOWN_SERVER:
return -2;
default:
outbead(sock, REPLY_ERROR, struct { int code; char error[50]; }, message.head.code, "Unknown message"); // wrong!!!
}
#if 0
static int messages = 0;
if (++messages == 5) {
warn(">>>>Simulate server crash<<<<");
exit(1);
}
#endif
return 0;
message_too_long:
warn("message %x too long (%u bytes)\n", message.head.code, message.head.length);
return -1;
message_too_short:
warn("message %x too short (%u bytes)\n", message.head.code, message.head.length);
return -1;
pipe_error:
return -1; /* we quietly drop the client if the connect breaks */
}
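/*
 * Illustrative sketch (not part of the message path above): the
 * range-coalescing idea described in the comment before incoming() --
 * consecutive chunks get merged into (start, count) ranges so a reply
 * carries as few contiguous ranges as possible.  The struct and function
 * names here are invented for the example; the real machinery is
 * addto_response()/finish_reply().
 */
#if 0
struct demo_range { chunk_t chunk; unsigned chunks; };

static unsigned demo_addto(struct demo_range *ranges, unsigned count, chunk_t chunk)
{
	/* Extend the previous range if this chunk is contiguous with it */
	if (count && ranges[count - 1].chunk + ranges[count - 1].chunks == chunk) {
		ranges[count - 1].chunks++;
		return count;
	}
	/* Otherwise start a new single-chunk range */
	ranges[count] = (struct demo_range){ .chunk = chunk, .chunks = 1 };
	return count + 1;
}
#endif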
/* Signal Delivery via pipe */
static int sigpipe;
void sighandler(int signum)
{
trace_off(printf("caught signal %i\n", signum);)
write(sigpipe, (char[]){signum}, 1);
}
int cleanup(struct superblock *sb)
{
warn("cleaning up");
sb->image.flags &= ~SB_BUSY;
set_sb_dirty(sb);
save_state(sb);
return 0;
}
int resolve_host(char *name, int family, void *result, int length)
{
struct hostent host, *bogus;
char work[500];
int err, dumb;
if ((err = gethostbyname2_r(name, family, &host, work, sizeof(work), &bogus, &dumb))) {
errno = err;
return -1;
}
memcpy(result, host.h_addr_list[0], host.h_length);
return host.h_length;
}
int resolve_self(int family, void *result, int length)
{
char name[HOST_NAME_MAX + 1];
if (gethostname(name, HOST_NAME_MAX) == -1)
return -1;
return resolve_host(name, family, result, length);
}
int snap_server(struct superblock *sb, const char *sockname, int port)
{
unsigned maxclients = 100, clients = 0, others = 3;
struct client *clientvec[maxclients];
struct pollfd pollvec[others+maxclients];
int listener, getsig, pipevec[2], err = 0;
if (pipe(pipevec))
error("Can't open pipe");
sigpipe = pipevec[1];
getsig = pipevec[0];
struct server server = { .port = htons(port), .type = AF_INET, };
if ((listener = socket(AF_INET, SOCK_STREAM, 0)) < 0)
error("Can't get socket");
if (bind(listener,
(struct sockaddr *)&(struct sockaddr_in){
.sin_family = server.type,
.sin_port = server.port,
.sin_addr = { .s_addr = INADDR_ANY } },
sizeof(struct sockaddr_in)) < 0)
error("Can't bind to socket");
listen(listener, 5);
warn("snap server bound to port %i", port);
/* Get agent connection */
struct sockaddr_un addr = { .sun_family = AF_UNIX };
int addr_len = sizeof(addr) - sizeof(addr.sun_path) + strlen(sockname);
int sock, len;
trace(warn("Connect to control socket %s", sockname);)
if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
error("Can't get socket");
strncpy(addr.sun_path, sockname, sizeof(addr.sun_path));
if (sockname[0] == '@')
addr.sun_path[0] = 0;
if (connect(sock, (struct sockaddr *)&addr, addr_len) == -1)
error("Can't connect to control socket");
trace_on(warn("Received control connection");)
pollvec[2] = (struct pollfd){ .fd = sock, .events = POLLIN };
if ((len = resolve_self(AF_INET, server.address, sizeof(server.address))) == -1)
error("Can't get own address, %s (%i)", strerror(errno), errno);
server.address_len = len;
warn("host = %x/%u", *(int *)server.address, server.address_len);
writepipe(sock, &(struct head){ SERVER_READY, sizeof(server) }, sizeof(struct head));
writepipe(sock, &server, sizeof(server));
#if 1
switch (fork()) {
case -1:
error("fork failed");
case 0: // !!! daemonize goes here
break;
default:
return 0;
}
#endif
pollvec[0] = (struct pollfd){ .fd = listener, .events = POLLIN };
pollvec[1] = (struct pollfd){ .fd = getsig, .events = POLLIN };
signal(SIGINT, sighandler);
signal(SIGTERM, sighandler);
signal(SIGPIPE, SIG_IGN);
while (1) {
int activity = poll(pollvec, others+clients, -1);
if (activity < 0) {
if (errno != EINTR)
error("poll failed, %s", strerror(errno));
continue;
}
if (!activity) {
printf("waiting...\n");
continue;
}
/* New connection? */
if (pollvec[0].revents) {
struct sockaddr_in addr;
socklen_t addr_len = sizeof(addr);
int sock;
if ((sock = accept(listener, (struct sockaddr *)&addr, &addr_len)) < 0)
error("Cannot accept connection");
trace_on(warn("Received connection");)
assert(clients < maxclients); // !!! send error and disconnect
struct client *client = malloc(sizeof(struct client));
*client = (struct client){ .sock = sock };
clientvec[clients] = client;
pollvec[others+clients] = (struct pollfd){ .fd = sock, .events = POLLIN };
clients++;
}
/* Signal? */
if (pollvec[1].revents) {
u8 sig = 0;
/* it's stupid but this read also gets interrupted, so... */
do { } while (read(getsig, &sig, 1) == -1 && errno == EINTR);
trace_on(warn("caught signal %i", sig);)
cleanup(sb); // !!! don't do it on segfault
if (sig == SIGINT) {
signal(SIGINT, SIG_DFL);
kill(getpid(), sig); /* commit hara-kiri */
}
goto done;
}
/* Agent message? */
if (pollvec[2].revents)
incoming(sb, &(struct client){ .sock = sock, .id = -2, .snap = -2 });
/* Client message? */
unsigned i = 0;
while (i < clients) {
if (pollvec[others+i].revents) { // !!! check for poll error
struct client *client = clientvec[i];
int result;
trace_off(printf("event on socket %i = %x\n", client->sock, pollvec[others+i].revents);)
if ((result = incoming(sb, client)) == -1) {
warn("Client %Li disconnected", client->id);
save_state(sb); // !!! just for now
close(client->sock);
free(client);
--clients;
clientvec[i] = clientvec[clients];
pollvec[others + i] = pollvec[others + clients];
continue;
}
if (result == -2) { // !!! wrong !!!
cleanup(sb);
goto done;
}
}
i++;
}
}
done:
// in a perfect world we'd close all the connections
close(listener);
return err;
}
#if 0
void usage(poptContext optCon, int exitcode, char *error, char *addl) {
poptPrintUsage(optCon, stderr, 0);
if (error) fprintf(stderr, "%s: %s\n", error, addl);
exit(exitcode);
}
#endif
static void *useme[] = {
_show_journal, delete_tree_range, expand_snapstore,
show_tree_range, snapnum_tag, show_snapshots,
};
int main(int argc, const char *argv[])
{
poptContext optCon;
unsigned volsize;
struct poptOption optionsTable[] = {
{ "size", 's', POPT_ARG_INT, &volsize, 0, "volume size", "size" },
POPT_AUTOHELP
{ NULL, 0, 0, NULL, 0 }
};
optCon = poptGetContext(NULL, argc, argv, optionsTable, 0);
#ifdef SERVER
poptSetOtherOptionHelp(optCon, "dev/snapshot dev/origin socket port");
if (argc < 5) {
#else
poptSetOtherOptionHelp(optCon, "dev/snapshot dev/origin");
if (argc < 3) {
#endif
poptPrintUsage(optCon, stderr, 0);
exit(1);
}
int c;
while ((c = poptGetNextOpt(optCon)) >= 0)
;
#if 0
snapname = poptGetArg(optCon);
if(!(poptPeekArg(optCon) == NULL)) {
poptPrintUsage(optCon, stderr, 0);
exit(1);
}
#endif
if (c < -1) {
fprintf(stderr, "%s: %s\n",
poptBadOption(optCon, POPT_BADOPTION_NOALIAS),
poptStrerror(c));
return 1;
}
struct superblock *sb = &(struct superblock){};
init_buffers();
#ifdef SERVER
if (argc < 5)
error("usage: %s dev/snapshot dev/origin socket port", argv[0]);
#else
if (argc < 3)
error("usage: %s dev/snapshot dev/origin", argv[0]);
#endif
if ((sb->snapdev = open(poptGetArg(optCon), O_RDWR | O_DIRECT)) == -1)
error("Could not open snapshot store %s", argv[1]);
if ((sb->orgdev = open(poptGetArg(optCon), O_RDONLY | O_DIRECT)) == -1)
error("Could not open origin volume %s", argv[2]);
#ifdef SERVER
const char *sockname = poptGetArg(optCon);
int port = atoi(poptGetArg(optCon));
poptFreeContext(optCon);
return snap_server(sb, sockname, port);
#else
poptFreeContext(optCon);
#if 0
init_snapstore(sb);
create_snapshot(sb, 0);
int i;
for (i = 0; i < 100; i++) {
make_unique(sb, i, 0);
}
flush_buffers();
evict_buffers();
warn("delete...");
delete_tree_range(sb, 1, 0, 5);
show_buffers();
warn("dirty buffers = %i", dirty_buffer_count);
show_tree(sb);
return 0;
#endif
return init_snapstore(sb);
#endif
if (useme) ;
}