/*
* Clustered Snapshot Metadata Server
*
* Daniel Phillips, Nov 2003 to May 2004
* (c) 2003 Sistina Software Inc.
* (c) 2004 Red Hat Software Inc.
*
*/
#define _GNU_SOURCE /* Berserk glibc headers: O_DIRECT not defined unless _GNU_SOURCE defined */
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <time.h>
#include <signal.h>
#include <sys/poll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <netdb.h> // gethostbyname2_r
#include <linux/fs.h> // BLKGETSIZE
#include <popt.h>
#include "list.h"
#include "buffer.h"
#include "ddsnap.h"
#include "../dm-ddsnap.h"
#include "trace.h"
#define trace trace_off
#define jtrace trace_off
#define BUSHY
/*
Todo:
BTree
* coalesce leaves/nodes for delete
- B*Tree splitting
Allocation bitmaps
- allocation statistics
- Per-snapshot free space as full-tree pass
- option to track specific snapshot(s) on the fly
- return stats to client (on demand? always?)
* Bitmap block radix tree - resizing
- allocation policy
Journal
\ allocation
\ write commit block
\ write target blocks
\ recovery
- stats, misc data in commit block?
File backing
\ double linked list ops
- buffer lru
- buffer writeout policy
- buffer eviction policy
- verify no busy buffers between operations
Snapshot vs origin locking
- anti-starvation measures
Message handling
- send reply on async write completion
- build up immediate reply in separate buffer
Snapshot handling path
- background copyout thread
- try AIO
- coalesce leaves/nodes on delete
- should wait for current queries on snap to complete
- background deletion optimization
- record current deletion list in superblock
Multithreading
- separate thread for copyouts
- separate thread for buffer flushing
- separate thread for new connections (?)
Utilities
- don't include anything not needed for create
- snapshot store integrity check (snapcheck)
Failover
\ Mark superblock active/inactive
+ upload client locks on server restart
+ release snapshot read locks for dead client
- Examine entire tree to initialize statistics after unsaved halt
General
\ Prevent multiple server starts on same snapshot store
+ More configurable tracing
- Add more internal consistency checks
- Magic number + version for superblock
- Flesh out and audit error paths
- Make it endian-neutral
- Verify wordsize neutral
- Add an on-the-fly verify path
+ strip out the unit testing gunk
+ More documentation
- Audits and more audits
- Memory inversion prevention
- failed chunk alloc fails operation, no abort
Cluster integration
+ Restart/Error recovery/reporting
*/
/*
* Miscellaneous Primitives
*/
typedef int fd_t;
/*
* Ripped from libiddev. It's not quite ugly enough to convince me to
* add a new dependency on a library that nobody has yet, but it's close.
*/
static int fd_size(int fd, u64 *bytes)
{
struct stat stat;
int error;
if ((error = fstat(fd, &stat)))
return error;
if (S_ISREG(stat.st_mode)) {
*bytes = stat.st_size;
return 0;
}
if ((error = ioctl(fd, BLKGETSIZE64, bytes))) {
unsigned sectors;
if ((error = ioctl(fd, BLKGETSIZE, &sectors)))
return error;
*bytes = ((u64)sectors) << 9;
}
return 0;
}
void hexdump(void *data, unsigned length)
{
while (length) {
int row = length < 16? length: 16;
printf("%p: ", data);
length -= row;
while (row--)
printf("%02hhx ", *(unsigned char *)data++);
printf("\n");
}
}
/* BTree Operations */
/* Directory at the base of the leaf block */
#define MAX_SNAPSHOTS 64
struct enode
{
u32 count;
u32 unused;
struct index_entry
{
u64 key; // note: entries[0].key never accessed
sector_t sector; // node sector address goes here
} entries[];
};
struct eleaf
{
le_u16 magic;
le_u16 version;
le_u32 count;
le_u64 base_chunk;
le_u64 using_mask;
struct etree_map
{
le_u32 offset;
le_u32 rchunk;
}
map[];
};
static inline struct enode *buffer2node(struct buffer *buffer)
{
return (struct enode *)buffer->data;
}
static inline struct eleaf *buffer2leaf(struct buffer *buffer)
{
return (struct eleaf *)buffer->data;
}
/* On-disk Format */
struct exception
{
le_u64 share;
le_u64 chunk;
};
static inline struct exception *emap(struct eleaf *leaf, unsigned i)
{
return (struct exception *)((char *) leaf + leaf->map[i].offset);
}
struct superblock
{
/* Persistent, saved to disk */
struct disksuper
{
char magic[8];
u64 create_time;
sector_t etree_root;
sector_t bitmap_base;
sector_t chunks, freechunks;
sector_t orgchunks;
chunk_t last_alloc;
u64 flags;
u32 blocksize_bits, chunksize_bits;
u64 deleting;
struct snapshot
{
u32 ctime; // relative to middle 32 bits of super ctime
u32 tag; // external name of snapshot
u8 bit; // internal snapshot number
s8 prio; // 0=normal, 127=highest, -128=lowest
char reserved[7];
} snaplist[MAX_SNAPSHOTS];
u32 snapshots;
u32 etree_levels;
u32 bitmap_blocks;
s32 journal_base, journal_next, journal_size;
u32 sequence;
} image;
/* Derived, not saved to disk */
u64 snapmask;
u32 blocksize, chunksize, blocks_per_node;
u32 sectors_per_block_bits, sectors_per_block;
u32 sectors_per_chunk_bits, sectors_per_chunk;
unsigned flags;
unsigned snapdev, orgdev;
unsigned snaplock_hash_bits;
struct snaplock **snaplocks;
unsigned copybuf_size;
char *copybuf;
chunk_t source_chunk;
chunk_t dest_exception;
unsigned copy_chunks;
unsigned max_commit_blocks;
};
#define SB_BUSY 1
#define SB_MAGIC "snapshot"
/* Journal handling */
#define JMAGIC "MAGICNUM"
struct commit_block
{
char magic[8];
u32 checksum;
s32 sequence;
u32 entries;
u64 sector[];
} PACKED;
static sector_t journal_sector(struct superblock *sb, unsigned i)
{
return sb->image.journal_base + (i << sb->sectors_per_block_bits);
}
static inline struct commit_block *buf2block(struct buffer *buf)
{
return (void *)buf->data;
}
unsigned next_journal_block(struct superblock *sb)
{
unsigned next = sb->image.journal_next;
if (++sb->image.journal_next == sb->image.journal_size)
sb->image.journal_next = 0;
return next;
}
static int is_commit_block(struct commit_block *block)
{
return !memcmp(&block->magic, JMAGIC, sizeof(block->magic));
}
static u32 checksum_block(struct superblock *sb, u32 *data)
{
int i, sum = 0;
for (i = 0; i < sb->image.blocksize_bits >> 2; i++)
sum += data[i];
return sum;
}
static struct buffer *jgetblk(struct superblock *sb, unsigned i)
{
return getblk(sb->snapdev, journal_sector(sb, i), sb->blocksize);
}
static struct buffer *jread(struct superblock *sb, unsigned i)
{
return bread(sb->snapdev, journal_sector(sb, i), sb->blocksize);
}
/*
* For now there is only ever one open transaction in the journal, the newest
* one, so we don't have to check for journal wrap, but just ensure that each
* transaction stays small enough to fit in the journal.
*
* Since we don't have any asynchronous IO at the moment, journal commit is
* straightforward: walk through the dirty blocks once, writing them to the
* journal, then again, adding sector locations to the commit block. We know
* the dirty list didn't change between the two passes. When asynchronous
* IO arrives here, this all has to be handled a lot more carefully.
*/
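/*
 * For illustration, with a 5-block journal: if journal_next is 3 and three
 * buffers are dirty, commit_transaction() writes the data copies to journal
 * positions 3, 4 and 0, writes the commit block (listing the three home
 * sectors) to position 1, and leaves journal_next at 2. Recovery later
 * replays the newest commit by copying each listed data block from the
 * journal back to its home sector.
 */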
static void commit_transaction(struct superblock *sb)
{
// flush_buffers();
// return;
if (list_empty(&dirty_buffers))
return;
struct list_head *list;
list_for_each(list, &dirty_buffers) {
struct buffer *buffer = list_entry(list, struct buffer, list);
unsigned pos = next_journal_block(sb);
jtrace(warn("journal data sector = %Lx [%u]", buffer->sector, pos);)
assert(buffer_dirty(buffer));
write_buffer_to(buffer, journal_sector(sb, pos));
}
unsigned pos = next_journal_block(sb);
struct buffer *commit_buffer = jgetblk(sb, pos);
struct commit_block *commit = buf2block(commit_buffer);
*commit = (struct commit_block){ .magic = JMAGIC, .sequence = sb->image.sequence++ };
while (!list_empty(&dirty_buffers)) {
struct list_head *entry = dirty_buffers.next;
struct buffer *buffer = list_entry(entry, struct buffer, list);
jtrace(warn("write data sector = %Lx", buffer->sector);)
assert(buffer_dirty(buffer));
assert(commit->entries < sb->max_commit_blocks);
commit->sector[commit->entries++] = buffer->sector;
write_buffer(buffer); // deletes it from dirty (fixme: fragile)
// we hope the order we just listed these is the same as committed above
}
jtrace(warn("commit journal block [%u]", pos);)
commit->checksum = 0;
commit->checksum = -checksum_block(sb, (void *)commit);
write_buffer_to(commit_buffer, journal_sector(sb, pos));
brelse(commit_buffer);
}
int recover_journal(struct superblock *sb)
{
struct buffer *buffer;
typeof(((struct commit_block *)NULL)->sequence) sequence = -1; /* no commit seen yet */
int scribbled = -1, last_block = -1, newest_block = -1;
int data_from_start = 0, data_from_last = 0;
int size = sb->image.journal_size;
char *why = "";
unsigned i;
/* Scan full journal, find newest commit */
for (i = 0; i < size; brelse(buffer), i++) {
buffer = jread(sb, i);
struct commit_block *block = buf2block(buffer);
if (!is_commit_block(block)) {
jtrace(warn("[%i] <data>", i);)
if (sequence == -1)
data_from_start++;
else
data_from_last++;
continue;
}
if (checksum_block(sb, (void *)block)) {
warn("block %i failed checksum", i);
hexdump(block, 40);
if (scribbled != -1) {
why = "Too many scribbled blocks in journal";
goto failed;
}
if (newest_block != -1 && newest_block != last_block) {
why = "Bad block not last written";
goto failed;
}
scribbled = i;
if (last_block != -1)
newest_block = last_block;
sequence++;
continue;
}
jtrace(warn("[%i] seq=%i", i, block->sequence);)
if (last_block != -1 && block->sequence != sequence + 1) {
int delta = sequence - block->sequence;
if (delta <= 0 || delta > size) {
why = "Bad sequence";
goto failed;
}
if (newest_block != -1) {
why = "Multiple sequence wraps";
goto failed;
}
if (!(scribbled == -1 || scribbled == i - 1)) {
why = "Bad block not last written";
goto failed;
}
newest_block = last_block;
}
data_from_last = 0;
last_block = i;
sequence = block->sequence;
}
if (last_block == -1) {
why = "No commit blocks found";
goto failed;
}
if (newest_block == -1) {
/* test for all the legal scribble positions here */
newest_block = last_block;
}
jtrace(warn("found newest commit [%u]", newest_block);)
buffer = jread(sb, newest_block);
struct commit_block *commit = buf2block(buffer);
unsigned entries = commit->entries;
for (i = 0; i < entries; i++) {
unsigned pos = (newest_block - entries + i + size) % size;
struct buffer *databuf = jread(sb, pos);
struct commit_block *block = buf2block(databuf);
if (is_commit_block(block)) {
error("data block [%u] marked as commit block", pos);
continue;
}
jtrace(warn("write journal [%u] data to %Lx", pos, commit->sector[i]);)
write_buffer_to(databuf, commit->sector[i]);
brelse(databuf);
}
sb->image.journal_next = (newest_block + 1 + size) % size;
sb->image.sequence = commit->sequence + 1;
brelse(buffer);
return 0;
failed:
errno = EIO; /* return a misleading error (be part of the problem) */
error("Journal recovery failed, %s", why);
return -1;
}
static void _show_journal(struct superblock *sb)
{
int i, j;
for (i = 0; i < sb->image.journal_size; i++) {
struct buffer *buf = jread(sb, i);
struct commit_block *block = buf2block(buf);
if (!is_commit_block(block)) {
printf("[%i] <data>\n", i);
continue;
}
printf("[%i] seq=%i (%i)", i, block->sequence, block->entries);
for (j = 0; j < block->entries; j++)
printf(" %Lx", (long long)block->sector[j]);
printf("\n");
brelse(buf);
}
printf("\n");
}
#define show_journal(sb) do { warn("Journal..."); _show_journal(sb); } while (0)
/* BTree leaf operations */
/*
* We operate directly on the BTree leaf blocks to insert exceptions and
* to enquire the sharing status of given chunks. This means all the data
* items in the block need to be properly aligned for architecture
* independence. To save space and to permit binary search a directory
* map at the beginning of the block points at the exceptions stored
* at the top of the block. The difference between two successive directory
* pointers gives the number of distinct exceptions for a given chunk.
* Each exception is paired with a bitmap that specifies which snapshots
* the exception belongs to.
*
* The chunk addresses in the leaf block directory are relative to a base
* chunk to save space. These are currently 32 bit values but may become
* 16 bits values. Since each is paired with a pointer into the list of
* exceptions, 16 bit emap entries would limit the blocksize to 64K.
*
* A mask in the leaf block header specifies which snapshots are actually
* encoded in the chunk. This allows lazy deletion (almost, needs fixing)
*
* The leaf operations need to know the size of the block somehow.
* Currently that is accomplished by inserting the block size as a sentinel
* in the block directory map; this may change.
*
* When an exception is created by a write to the origin it is initially
* shared by all snapshots that don't already have exceptions. Snapshot
* writes may later unshare some of these exceptions.
*/
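/*
 * A worked example, assuming the usual 4096-byte block and 16-byte struct
 * exception: a leaf holding two chunks might have map[0] = {offset 4064,
 * rchunk 0x10}, map[1] = {offset 4080, rchunk 0x14} and sentinel
 * map[2].offset = 4096. Since emap(leaf, i) is just (char *)leaf +
 * map[i].offset, relative chunk 0x10 owns the exception at bytes 4064..4079,
 * relative chunk 0x14 the one at 4080..4095, and the difference between
 * successive offsets divided by sizeof(struct exception) gives each chunk's
 * exception count. The exception area grows downward from the top of the
 * block while the map grows upward from the header; leaf_freespace() below
 * reports the gap left between them.
 */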
/*
* To do:
* - Check leaf, index structure
* - Mechanism for identifying which snapshots are in each leaf
* - binsearch for leaf, index lookup
* - enforce 32 bit address range within leaf
*/
struct buffer *snapread(struct superblock *sb, sector_t sector)
{
return bread(sb->snapdev, sector, sb->blocksize);
}
void show_leaf(struct eleaf *leaf)
{
struct exception *p;
int i;
printf("%i chunks: ", leaf->count);
for (i = 0; i < leaf->count; i++) {
printf("%x=", leaf->map[i].rchunk);
// printf("@%i ", leaf->map[i].offset);
for (p = emap(leaf, i); p < emap(leaf, i+1); p++)
printf("%Lx/%08llx%s", p->chunk, p->share, p+1 < emap(leaf, i+1)? ",": " ");
}
// printf("top@%i", leaf->map[i].offset);
printf("\n");
}
/*
* origin_chunk_unique: an origin logical chunk is shared unless all snapshots
* have exceptions.
*/
int origin_chunk_unique(struct eleaf *leaf, u64 chunk, u64 snapmask)
{
u64 using = 0;
unsigned i, target = chunk - leaf->base_chunk;
struct exception *p;
for (i = 0; i < leaf->count; i++)
if (leaf->map[i].rchunk == target)
goto found;
return !snapmask;
found:
for (p = emap(leaf, i); p < emap(leaf, i+1); p++)
using |= p->share;
return !(~using & snapmask);
}
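/*
 * For example, with snapmask 0b111 (three snapshots) and a chunk whose
 * exceptions carry share bitmaps 0b001 and 0b010, using becomes 0b011 and
 * ~using & snapmask = 0b100, so the function returns 0: snapshot 2 still
 * shares the origin chunk and an origin write must copy it out first.
 */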
/*
* snapshot_chunk_unique: a snapshot logical chunk is shared if it has no
* exception or has the same exception as another snapshot. In any case
* if the chunk has an exception we need to know the exception address.
*/
int snapshot_chunk_unique(struct eleaf *leaf, u64 chunk, int snapshot, u64 *exception)
{
u64 mask = 1LL << snapshot;
unsigned i, target = chunk - leaf->base_chunk;
struct exception *p;
for (i = 0; i < leaf->count; i++)
if (leaf->map[i].rchunk == target)
goto found;
return 0;
found:
for (p = emap(leaf, i); p < emap(leaf, i+1); p++)
/* shared if more than one bit set including this one */
if ((p->share & mask)) {
*exception = p->chunk;
// printf("unique %Lx %Lx\n", p->share, mask);
return !(p->share & ~mask);
}
return 0;
}
/*
* add_exception_to_leaf:
* - cycle through map to find existing logical chunk or insertion point
* - if not found need to add new chunk address
* - move tail of map up
* - store new chunk address in map
* - otherwise
* - for origin:
* - or together all sharemaps, invert -> new map
* - for snapshot:
* - clear out bit for existing exception
* - if sharemap zero warn and reuse this location
* - insert new exception
* - move head of exceptions down
* - store new exception/sharemap
* - adjust map head offsets
*
* If the new exception won't fit in the leaf, return an error so that
* higher level code may split the leaf and try again. This keeps the
* leaf-editing code complexity down to a dull roar.
*/
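/*
 * For instance, if a chunk's only exception is shared by snapshots 0 and 1
 * (share = 0b11) and snapshot 0 is written, add_exception_to_leaf() clears
 * bit 0 in the existing exception (leaving share = 0b10) and inserts a new
 * exception with sharemap = 0b01. An origin write to a chunk whose existing
 * exceptions together cover 0b01 of an active mask 0b11 would instead
 * insert one exception with sharemap = ~0b01 & 0b11 = 0b10.
 */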
unsigned leaf_freespace(struct eleaf *leaf)
{
char *maptop = (char *)(&leaf->map[leaf->count + 1]); // include sentinel
return (char *)emap(leaf, 0) - maptop;
}
unsigned leaf_payload(struct eleaf *leaf)
{
int lower = (char *)(&leaf->map[leaf->count]) - (char *)leaf->map;
int upper = (char *)emap(leaf, leaf->count) - (char *)emap(leaf, 0);
return lower + upper;
}
int add_exception_to_leaf(struct eleaf *leaf, u64 chunk, u64 exception, int snapshot, u64 active)
{
unsigned i, j, target = chunk - leaf->base_chunk;
u64 mask = 1ULL << snapshot, sharemap;
struct exception *ins, *exceptions = emap(leaf, 0);
char *maptop = (char *)(&leaf->map[leaf->count + 1]); // include sentinel
int free = (char *)exceptions - maptop
#ifdef BUSHY
- 10
#endif
;
trace(warn("chunk %Lx exception %Lx, snapshot = %i", chunk, exception, snapshot);)
for (i = 0; i < leaf->count; i++) // !!! binsearch goes here
if (leaf->map[i].rchunk >= target)
break;
if (i == leaf->count || leaf->map[i].rchunk > target) {
if (free < sizeof(struct exception) + sizeof(struct etree_map))
return -EFULL;
ins = emap(leaf, i);
memmove(&leaf->map[i+1], &leaf->map[i], maptop - (char *)&leaf->map[i]);
leaf->map[i].offset = (char *)ins - (char *)leaf;
leaf->map[i].rchunk = target;
leaf->count++;
sharemap = snapshot == -1? active: mask;
goto insert;
}
if (free < sizeof(struct exception))
return -EFULL;
if (snapshot == -1) {
for (sharemap = 0, ins = emap(leaf, i); ins < emap(leaf, i+1); ins++)
sharemap |= ins->share;
sharemap = (~sharemap) & active;
} else {
for (ins = emap(leaf, i); ins < emap(leaf, i+1); ins++)
if ((ins->share & mask)) {
ins->share &= ~mask;
break;
}
sharemap = mask;
}
ins = emap(leaf, i);
insert:
memmove(exceptions - 1, exceptions, (char *)ins - (char *)exceptions);
ins--;
ins->share = sharemap;
ins->chunk = exception;
for (j = 0; j <= i; j++)
leaf->map[j].offset -= sizeof(struct exception);
return 0;
}
/*
* split_leaf: Split one leaf into two approximately in the middle. Copy
* the upper half of entries to the new leaf and move the lower half of
* entries to the top of the original block.
*/
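/*
 * For example, a leaf with five map entries splits as nhead = 3, ntail = 2:
 * entries 3 and 4 (with their exceptions) are copied to the new leaf,
 * entries 0..2 stay in the original with their data moved to the top of the
 * block, and the returned splitpoint is map[3].rchunk + base_chunk, the
 * first key now owned by the new leaf.
 */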
u64 split_leaf(struct eleaf *leaf, struct eleaf *leaf2)
{
unsigned i, nhead = (leaf->count + 1) / 2, ntail = leaf->count - nhead, tailsize;
/* Should split at middle of data instead of median exception */
u64 splitpoint = leaf->map[nhead].rchunk + leaf->base_chunk;
char *phead, *ptail;
phead = (char *)emap(leaf, 0);
ptail = (char *)emap(leaf, nhead);
tailsize = (char *)emap(leaf, leaf->count) - ptail;
/* Copy upper half to new leaf */
memcpy(leaf2, leaf, offsetof(struct eleaf, map)); // header
memcpy(&leaf2->map[0], &leaf->map[nhead], (ntail + 1) * sizeof(struct etree_map)); // map
memcpy(ptail - (char *)leaf + (char *)leaf2, ptail, tailsize); // data
leaf2->count = ntail;
/* Move lower half to top of block */
memmove(phead + tailsize, phead, ptail - phead);
leaf->count = nhead;
for (i = 0; i <= nhead; i++) // also adjust sentinel
leaf->map[i].offset += tailsize;
leaf->map[nhead].rchunk = 0; // tidy up
return splitpoint;
}
void merge_leaves(struct eleaf *leaf, struct eleaf *leaf2)
{
unsigned nhead = leaf->count, ntail = leaf2->count, i;
unsigned tailsize = (char *)emap(leaf2, ntail) - (char *)emap(leaf2, 0);
char *phead = (char *)emap(leaf, 0), *ptail = (char *)emap(leaf, nhead);
// adjust pointers
for (i = 0; i <= nhead; i++) // also adjust sentinel
leaf->map[i].offset -= tailsize;
// move data down
phead = (char *)emap(leaf, 0);
ptail = (char *)emap(leaf, nhead);
memmove(phead, phead + tailsize, ptail - phead);
// move data from leaf2 to top
memcpy(ptail, (char *)emap(leaf2, 0), tailsize); // data
memcpy(&leaf->map[nhead], &leaf2->map[0], (ntail + 1) * sizeof(struct etree_map)); // map
leaf->count += ntail;
}
void merge_nodes(struct enode *node, struct enode *node2)
{
memcpy(&node->entries[node->count], &node2->entries[0], node2->count * sizeof(struct index_entry));
node->count += node2->count;
}
void init_leaf(struct eleaf *leaf, int block_size)
{
leaf->magic = 0x1eaf;
leaf->version = 0;
leaf->base_chunk = 0;
leaf->count = 0;
leaf->map[0].offset = block_size;
#ifdef BUSHY
leaf->map[0].offset = 200;
#endif
}
/*
* Chunk allocation via bitmaps
*/
#define SB_DIRTY 1
#define SB_SECTOR 8
#define SB_SIZE 4096
void set_sb_dirty(struct superblock *sb)
{
sb->flags |= SB_DIRTY;
}
static inline int get_bitmap_bit(unsigned char *bitmap, unsigned bit)
{
return bitmap[bit >> 3] & (1 << (bit & 7));
}
static inline void set_bitmap_bit(unsigned char *bitmap, unsigned bit)
{
bitmap[bit >> 3] |= 1 << (bit & 7);
}
static inline void clear_bitmap_bit(unsigned char *bitmap, unsigned bit)
{
bitmap[bit >> 3] &= ~(1 << (bit & 7));
}
static unsigned calc_bitmap_blocks(struct superblock *sb, u64 chunks)
{
unsigned chunkshift = sb->image.chunksize_bits;
return (chunks + (1 << (chunkshift + 3)) - 1) >> (chunkshift + 3);
}
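/*
 * One bitmap block covers 1 << (chunksize_bits + 3) chunks, i.e. 8 * chunksize
 * (this assumes blocksize = chunksize, as noted elsewhere). With 4096-byte
 * chunks that is 32768 chunks per bitmap block, so a million-chunk snapshot
 * store needs 31 bitmap blocks.
 */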
static void init_allocation(struct superblock *sb)
{
u64 chunks = sb->image.chunks;
unsigned bitmaps = calc_bitmap_blocks(sb, chunks);
unsigned bitmap_base_chunk = (SB_SECTOR + sb->sectors_per_block + sb->sectors_per_chunk - 1) >> sb->sectors_per_chunk_bits;
unsigned bitmap_chunks = sb->image.bitmap_blocks = bitmaps; // !!! chunksize same as blocksize
unsigned reserved = bitmap_base_chunk + bitmap_chunks + sb->image.journal_size; // !!! chunksize same as blocksize
unsigned sector = sb->image.bitmap_base = bitmap_base_chunk << sb->sectors_per_chunk_bits;
warn("snapshot store size: %Lu chunks (%Lu sectors)", chunks, chunks << sb->sectors_per_chunk_bits);
printf("Initializing %u bitmap blocks... ", bitmaps);
unsigned i;
for (i = 0; i < bitmaps; i++, sector += sb->sectors_per_block) {
struct buffer *buffer = getblk(sb->snapdev, sector, sb->blocksize);
printf("%Lx ", buffer->sector);
memset(buffer->data, 0, sb->blocksize);
/* Reserve bitmaps and superblock */
if (i == 0) {
unsigned i;
for (i = 0; i < reserved; i++)
set_bitmap_bit(buffer->data, i);
}
/* Suppress overrun allocation in partial last byte */
if (i == bitmaps - 1 && (chunks & 7))
buffer->data[(chunks >> 3) & (sb->blocksize - 1)] |= 0xff << (chunks & 7);
trace_off(dump_buffer(buffer, 0, 16);)
brelse_dirty(buffer);
}
printf("\n");
sb->image.freechunks = chunks - reserved;
sb->image.last_alloc = 0;
sb->image.journal_base = (bitmap_base_chunk + bitmap_chunks) << sb->sectors_per_chunk_bits;
}
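/*
 * Resulting layout, assuming 512-byte sectors and 4096-byte chunks (8 sectors
 * each): the superblock sits at sector SB_SECTOR (8), the first bitmap block
 * starts at chunk (8 + 8 + 8 - 1) >> 3 = 2 (sector 16), the journal follows
 * the bitmap blocks, and all of these reserved chunks are marked allocated in
 * the first bitmap block so alloc_chunk() never hands them out.
 */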
static void free_chunk(struct superblock *sb, chunk_t chunk)
{
unsigned bitmap_shift = sb->image.blocksize_bits + 3, bitmap_mask = (1 << bitmap_shift ) - 1;
u64 bitmap_block = chunk >> bitmap_shift;
trace(printf("free chunk %Lx\n", chunk);)
struct buffer *buffer = snapread(sb, sb->image.bitmap_base + (bitmap_block << sb->sectors_per_block_bits));
if (!get_bitmap_bit(buffer->data, chunk & bitmap_mask)) {
warn("chunk %Lx already free!", (long long)chunk);
brelse(buffer);
return;
}
clear_bitmap_bit(buffer->data, chunk & bitmap_mask);
brelse_dirty(buffer);
sb->image.freechunks++;
set_sb_dirty(sb); // !!! optimize this away
}
static inline void free_block(struct superblock *sb, sector_t address)
{
free_chunk(sb, address >> sb->sectors_per_chunk_bits); // !!! assumes blocksize = chunksize
}
void grab_chunk(struct superblock *sb, chunk_t chunk) // just for testing
{
unsigned bitmap_shift = sb->image.blocksize_bits + 3, bitmap_mask = (1 << bitmap_shift ) - 1;
u64 bitmap_block = chunk >> bitmap_shift;
struct buffer *buffer = snapread(sb, sb->image.bitmap_base + (bitmap_block << sb->sectors_per_block_bits));
assert(!get_bitmap_bit(buffer->data, chunk & bitmap_mask));
set_bitmap_bit(buffer->data, chunk & bitmap_mask);
brelse_dirty(buffer);
}
chunk_t alloc_chunk_range(struct superblock *sb, chunk_t chunk, chunk_t range)
{
unsigned bitmap_shift = sb->image.blocksize_bits + 3, bitmap_mask = (1 << bitmap_shift ) - 1;
u64 blocknum = chunk >> bitmap_shift;
unsigned bit = chunk & 7, offset = (chunk & bitmap_mask) >> 3;
u64 length = (range + bit + 7) >> 3;
while (1) {
struct buffer *buffer = snapread(sb, sb->image.bitmap_base + (blocknum << sb->sectors_per_block_bits));
unsigned char c, *p = buffer->data + offset;
unsigned tail = sb->blocksize - offset, n = tail > length? length: tail;
trace_off(printf("search %u bytes of bitmap %Lx from offset %u\n", n, blocknum, offset);)
// dump_buffer(buffer, 4086, 10);
for (length -= n; n--; p++)
if ((c = *p) != 0xff) {
int i, bit;
trace_off(printf("found byte at offset %u of bitmap %Lx = %hhx\n", p - buffer->data, blocknum, c);)
for (i = 0, bit = 1;; i++, bit <<= 1)
if (!(c & bit)) {
chunk = i + ((p - buffer->data) << 3) + (blocknum << bitmap_shift);
assert(!get_bitmap_bit(buffer->data, chunk & bitmap_mask));
set_bitmap_bit(buffer->data, chunk & bitmap_mask);
brelse_dirty(buffer);
sb->image.freechunks--;
set_sb_dirty(sb); // !!! optimize this away
return chunk;
}
}
brelse(buffer);
if (!length)
return 0;
if (++blocknum == sb->image.bitmap_blocks)
blocknum = 0;
offset = 0;
trace_off(printf("go to bitmap %Lx\n", blocknum);)
}
}
static int delete_snapshot(struct superblock *sb, unsigned tag);
// !!! possible/practical to topsort away this forward ref?
static chunk_t alloc_chunk(struct superblock *sb)
{
chunk_t last = sb->image.last_alloc, total = sb->image.chunks, found;
do {
if ((found = alloc_chunk_range(sb, last, total - last)) ||
(found = alloc_chunk_range(sb, 0, last))) {
sb->image.last_alloc = found;
set_sb_dirty(sb); // !!! optimize this away
return (found);
}
int tag = sb->image.snaplist[0].tag;
warn("snapshot store full, release snapshot %i", tag);
if (delete_snapshot(sb, tag))
error("snapshot delete failed");
} while (sb->image.snapshots);
error("snapshot store full");
return -1;
}
/* Snapshot Store Allocation */
static sector_t alloc_block(struct superblock *sb)
{
return alloc_chunk(sb) << sb->sectors_per_chunk_bits; // !!! assume blocksize = chunksize
}
static u64 alloc_exception(struct superblock *sb)
{
return alloc_chunk(sb);
}
static struct buffer *new_block(struct superblock *sb)
{
// !!! handle alloc_block failure
return getblk(sb->snapdev, alloc_block(sb), sb->blocksize);
}
static struct buffer *new_leaf(struct superblock *sb)
{
trace(printf("New leaf\n");)
struct buffer *buffer = new_block(sb);
init_leaf(buffer2leaf(buffer), sb->blocksize);
set_buffer_dirty(buffer);
return buffer;
}
static struct buffer *new_node(struct superblock *sb)
{
trace(printf("New node\n");)
struct buffer *buffer = new_block(sb);
struct enode *node = buffer2node(buffer);
node->count = 0;
set_buffer_dirty(buffer);
return buffer;
}
/* BTree debug dump */
static void show_subtree(struct superblock *sb, struct enode *node, int levels, int indent)
{
int i;
printf("%*s", indent, "");
printf("%i nodes:\n", node->count);
for (i = 0; i < node->count; i++) {
struct buffer *buffer = snapread(sb, node->entries[i].sector);
if (i)
printf("pivot = %Lx\n", (long long)node->entries[i].key);
if (levels)
show_subtree(sb, buffer2node(buffer), levels - 1, indent + 3);
else {
printf("%*s", indent + 3, "");
show_leaf(buffer2leaf(buffer));
}
brelse(buffer);
}
}
static void show_tree(struct superblock *sb)
{
struct buffer *buffer = snapread(sb, sb->image.etree_root);
show_subtree(sb, buffer2node(buffer), sb->image.etree_levels - 1, 0);
brelse(buffer);
}
/* High Level BTree Editing */
/*
* BTree insertion is a little hairy, as expected. We keep track of the
* access path in a vector of etree_path elements, each of which holds
* a node buffer and a pointer into the buffer data giving the address at
* which the next buffer in the path was found, which is also where a new
* node will be inserted if necessary. If a leaf is split we may need to
* work all the way up from the bottom to the top of the path, splitting
* index nodes as well. If we split the top index node we need to add
* a new tree level. We have to keep track of which nodes were modified
* and of the refcounts of all buffers involved, which can be quite
* a few.
*
* Note that the first key of an index block is never accessed. This is
* because for a btree, there is always one more key than nodes in each
* index node. In other words, keys lie between node pointers. We will
* micro-optimize by placing the node count in the first key, which allows
* a node to contain an esthetically pleasing binary number of pointers.
* (Not done yet.)
*/
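/*
 * Concretely, after probe(sb, chunk, path) on a tree with etree_levels == 2,
 * path[0] holds the root node buffer with pnext pointing just past the index
 * entry that was followed, path[1] holds the interior node above the leaf in
 * the same way, and the returned buffer is the leaf itself. If that leaf
 * later splits, add_exception_to_tree() walks the path back up, inserting
 * the new child at each recorded pnext position.
 */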
#define MAX_ETREE_DEPTH 6
struct etree_path { struct buffer *buffer; struct index_entry *pnext; };
static struct buffer *probe(struct superblock *sb, u64 chunk, struct etree_path *path)
{
unsigned i, levels = sb->image.etree_levels;
struct buffer *nodebuf = snapread(sb, sb->image.etree_root);
struct enode *node = buffer2node(nodebuf);
for (i = 0; i < levels; i++) {
struct index_entry *pnext = node->entries, *top = pnext + node->count;
while (++pnext < top)
if (pnext->key > chunk)
break;
path[i].buffer = nodebuf;
path[i].pnext = pnext;
nodebuf = snapread(sb, (pnext - 1)->sector);
node = (struct enode *)nodebuf->data;
}
assert(((struct eleaf *)nodebuf->data)->magic == 0x1eaf);
return nodebuf;
}
static void brelse_path(struct etree_path *path, unsigned levels)
{
unsigned i;
for (i = 0; i < levels; i++)
brelse(path[i].buffer);
}
static void show_tree_range(struct superblock *sb, chunk_t start, unsigned leaves)
{
int levels = sb->image.etree_levels, level = -1;
struct etree_path path[levels];
struct buffer *nodebuf;
struct enode *node;
struct buffer *leafbuf;
#if 1
leafbuf = probe(sb, start, path);
level = levels - 1;
nodebuf = path[level].buffer;
node = buffer2node(nodebuf);
goto start;
#endif
while (1) {
do {
level++;
nodebuf = snapread(sb, level? path[level - 1].pnext++->sector: sb->image.etree_root);
node = buffer2node(nodebuf);
path[level].buffer = nodebuf;
path[level].pnext = node->entries;
trace(printf("push to level %i, %i nodes\n", level, node->count);)
} while (level < levels - 1);
trace(printf("do %i leaf nodes level = %i\n", node->count, level);)
while (path[level].pnext < node->entries + node->count) {
leafbuf = snapread(sb, path[level].pnext++->sector);
start: show_leaf(buffer2leaf(leafbuf));
brelse(leafbuf);
if (!--leaves) {
brelse_path(path, level + 1);
return;
}
}
do {
brelse(nodebuf);
if (!level)
return;
nodebuf = path[--level].buffer;
node = buffer2node(nodebuf);
trace(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - node->entries, node->count);)
} while (path[level].pnext == node->entries + node->count);
};
}
static void insert_child(struct enode *node, struct index_entry *p, sector_t child, u64 childkey)
{
memmove(p + 1, p, (char *)(&node->entries[0] + node->count) - (char *)p);
p->sector = child;
p->key = childkey;
node->count++;
}
static void add_exception_to_tree(struct superblock *sb, struct buffer *leafbuf, u64 target, u64 exception, int snapnum, struct etree_path path[], unsigned levels)
{
if (!add_exception_to_leaf(buffer2leaf(leafbuf), target, exception, snapnum, sb->snapmask)) {
brelse_dirty(leafbuf);
return;
}
trace(warn("add leaf");)
struct buffer *childbuf = new_leaf(sb);
u64 childkey = split_leaf(buffer2leaf(leafbuf), buffer2leaf(childbuf));
sector_t childsector = childbuf->sector;
if (add_exception_to_leaf(target < childkey? buffer2leaf(leafbuf): buffer2leaf(childbuf), target, exception, snapnum, sb->snapmask))
error("can't happen");
brelse_dirty(leafbuf);
brelse_dirty(childbuf);
while (levels--) {
struct index_entry *pnext = path[levels].pnext;
struct buffer *parentbuf = path[levels].buffer;
struct enode *parent = buffer2node(parentbuf);
if (parent->count < sb->blocks_per_node) {
insert_child(parent, pnext, childsector, childkey);
set_buffer_dirty(parentbuf);
return;
}
unsigned half = parent->count / 2;
u64 newkey = parent->entries[half].key;
struct buffer *newbuf = new_node(sb); // !!! handle error
struct enode *newnode = buffer2node(newbuf);
newnode->count = parent->count - half;
memcpy(&newnode->entries[0], &parent->entries[half], newnode->count * sizeof(struct index_entry));
parent->count = half;
if (pnext > &parent->entries[half]) {
pnext = pnext - &parent->entries[half] + newnode->entries;
set_buffer_dirty(parentbuf);
parentbuf = newbuf;
parent = newnode;
} else set_buffer_dirty(newbuf);
insert_child(parent, pnext, childsector, childkey);
set_buffer_dirty(parentbuf);
childkey = newkey;
childsector = newbuf->sector;
brelse(newbuf);
}
trace(printf("add tree level\n");)
struct buffer *newrootbuf = new_node(sb); // !!! handle error
struct enode *newroot = buffer2node(newrootbuf);
newroot->count = 2;
newroot->entries[0].sector = sb->image.etree_root;
newroot->entries[1].key = childkey;
newroot->entries[1].sector = childsector;
sb->image.etree_root = newrootbuf->sector;
sb->image.etree_levels++;
set_sb_dirty(sb);
brelse_dirty(newrootbuf);
}
#define chunk_highbit ((sizeof(chunk_t) * 8) - 1)
static int finish_copyout(struct superblock *sb)
{
if (sb->copy_chunks) {
int is_snap = sb->source_chunk >> chunk_highbit;
chunk_t source = sb->source_chunk & ~(1ULL << chunk_highbit);
unsigned size = sb->copy_chunks << sb->image.chunksize_bits;
trace(warn("copy %u %schunks from %Lx to %Lx", sb->copy_chunks, is_snap? "snapshot ": "", source, sb->dest_exception);)
assert(size <= sb->copybuf_size);
pread(is_snap? sb->snapdev: sb->orgdev, sb->copybuf, size, source << sb->image.chunksize_bits); // 64 bit!!!
pwrite(sb->snapdev, sb->copybuf, size, sb->dest_exception << sb->image.chunksize_bits); // 64 bit!!!
sb->copy_chunks = 0;
}
return 0;
}
static int copyout(struct superblock *sb, chunk_t chunk, chunk_t exception)
{
#if 1
if (sb->source_chunk + sb->copy_chunks == chunk &&
sb->dest_exception + sb->copy_chunks == exception &&
sb->copy_chunks < sb->copybuf_size >> sb->image.chunksize_bits) {
sb->copy_chunks++;
return 0;
}
finish_copyout(sb);
sb->copy_chunks = 1;
sb->source_chunk = chunk;
sb->dest_exception = exception;
#else
int is_snap = sb->source_chunk >> chunk_highbit;
chunk_t source = chunk & ~((1ULL << chunk_highbit) - 1);
pread(is_snap? sb->snapdev: sb->orgdev, sb->copybuf, sb->chunksize, source << sb->image.chunksize_bits); // 64 bit!!!
pwrite(sb->snapdev, sb->copybuf, sb->chunksize, exception << sb->image.chunksize_bits); // 64 bit!!!
#endif
return 0;
}
/*
* This is the bit that does all the work. It's rather arbitrarily
* factored into a probe and test part, then an exception add part,
* called only if an exception for a given chunk isn't already present
* in the Btree. This factoring will change a few more times yet as
* the code gets more asynchronous and multi-threaded.
*/
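/*
 * Walking through the common case: an origin write (snapnum == -1) to a chunk
 * that some snapshot still shares probes down to the leaf, finds
 * origin_chunk_unique() false, allocates a free snapshot store chunk with
 * alloc_exception(), lets copyout() batch the origin data for copying there,
 * records the new exception with add_exception_to_tree(), and returns the
 * exception's chunk address.
 */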
static chunk_t make_unique(struct superblock *sb, chunk_t chunk, int snapnum)
{
unsigned levels = sb->image.etree_levels;
struct etree_path path[levels + 1];
struct buffer *leafbuf = probe(sb, chunk, path);
chunk_t exception = 0;
trace(warn("chunk %Lx, snapnum %i", chunk, snapnum));
if (snapnum == -1?
origin_chunk_unique(buffer2leaf(leafbuf), chunk, sb->snapmask):
snapshot_chunk_unique(buffer2leaf(leafbuf), chunk, snapnum, &exception))
{
trace(warn("chunk %Lx already unique in snapnum %i", chunk, snapnum);)
brelse(leafbuf);
goto out;
}
u64 newex = alloc_exception(sb);
if (newex == -1) {
brelse(leafbuf); // !!! maybe add_exception_to_tree should not do this
goto out;
}
// if (snapnum == -1)
copyout(sb, exception? (exception | (1ULL << chunk_highbit)): chunk, newex);
add_exception_to_tree(sb, leafbuf, chunk, newex, snapnum, path, levels);
exception = newex;
out:
brelse_path(path, levels);
return exception;
}
static int test_unique(struct superblock *sb, chunk_t chunk, int snapnum, chunk_t *exception)
{
unsigned levels = sb->image.etree_levels;
struct etree_path path[levels + 1];
struct buffer *leafbuf = probe(sb, chunk, path);
trace(warn("chunk %Lx, snapnum %i", chunk, snapnum));
int result = snapnum == -1?
origin_chunk_unique(buffer2leaf(leafbuf), chunk, sb->snapmask):
snapshot_chunk_unique(buffer2leaf(leafbuf), chunk, snapnum, exception);
brelse(leafbuf);
brelse_path(path, levels);
return result;
}
/* Snapshot Store Superblock handling */
static u64 calc_snapmask(struct superblock *sb)
{
u64 mask = 0;
int i;
for (i = 0; i < sb->image.snapshots; i++)
mask |= 1ULL << sb->image.snaplist[i].bit;
return mask;
}
static int tag_snapnum(struct superblock *sb, unsigned tag)
{
unsigned i, n = sb->image.snapshots;
for (i = 0; i < n; i++)
if (sb->image.snaplist[i].tag == tag)
return sb->image.snaplist[i].bit;
return -1;
}
static int snapnum_tag(struct superblock *sb, unsigned bit)
{
unsigned i, n = sb->image.snapshots;
for (i = 0; i < n; i++)
if (sb->image.snaplist[i].bit == bit)
return sb->image.snaplist[i].tag;
return -1;
}
static int create_snapshot(struct superblock *sb, unsigned snaptag)
{
unsigned i, snapshots = sb->image.snapshots;
struct snapshot *snapshot;
/* Check tag not already used */
for (i = 0; i < snapshots; i++)
if (sb->image.snaplist[i].tag == snaptag)
return -1;
/* Find available snapshot bit */
for (i = 0; i < MAX_SNAPSHOTS; i++)
if (!(sb->snapmask & (1ULL << i)))
goto create;
return -EFULL;
create:
trace_on(printf("Create snapshot %i (internal %i)\n", snaptag, i);)
snapshot = sb->image.snaplist + sb->image.snapshots++;
*snapshot = (struct snapshot){ .tag = snaptag, .bit = i, .ctime = time(NULL) };
sb->snapmask |= (1ULL << i);
set_sb_dirty(sb);
return i;
};
/*
* delete_snapshot: remove all exceptions from a given snapshot from a leaf
* working from top to bottom of the exception list clearing snapshot bits
* and packing the nonzero exceptions into the top of the block. Then work
* from bottom to top in the directory map packing nonempty entries into the
* bottom of the map.
*/
static int delete_snapshots_from_leaf(struct superblock *sb, struct eleaf *leaf, u64 snapmask, unsigned max_dirty)
{
struct exception *p = emap(leaf, leaf->count), *dest = p;
struct etree_map *pmap, *dmap;
unsigned i, any = 0;
/* Scan top to bottom clearing snapshot bit and moving
* non-zero entries to top of block */
for (i = leaf->count; i--;) {
while (p != emap(leaf, i)) {
if (dirty_buffer_count >= max_dirty)
snapmask = 0;
u64 share = (--p)->share;
any |= share & snapmask;
if ((share &= ~snapmask))
*--dest = *p;
else
free_chunk(sb, p->chunk);
}
leaf->map[i].offset = (char *)dest - (char *)leaf;
}
/* Remove empties from map */
dmap = pmap = &leaf->map[0];
for (i = 0; i < leaf->count; i++, pmap++)
if (pmap->offset != (pmap + 1)->offset)
*dmap++ = *pmap;
dmap->offset = pmap->offset;
dmap->rchunk = 0; // tidy up
leaf->count = dmap - &leaf->map[0];
return !!any;
}
static void delete_snapshots_from_tree(struct superblock *sb, u64 snapmask)
{
int levels = sb->image.etree_levels, level = -1;
struct etree_path path[levels];
struct buffer *nodebuf;
struct enode *node;
trace_on(printf("delete snapshot mask %Lx\n", snapmask);)
while (1) {
do {
level++;
nodebuf = snapread(sb, level? path[level - 1].pnext++->sector: sb->image.etree_root);
node = buffer2node(nodebuf);
path[level].buffer = nodebuf;
path[level].pnext = node->entries;
trace(printf("push to level %i, %i nodes\n", level, node->count);)
} while (level < levels - 1);
trace(printf("do %i leaf nodes\n", node->count);)
while (path[level].pnext < node->entries + node->count) {
struct buffer *leafbuf = snapread(sb, path[level].pnext++->sector);
trace_off(printf("process leaf %Lx\n", leafbuf->sector);)
delete_snapshots_from_leaf(sb, buffer2leaf(leafbuf), snapmask, -1);
brelse(leafbuf);
}
do {
brelse(nodebuf);
if (!level)
return;
nodebuf = path[--level].buffer;
node = buffer2node(nodebuf);
trace(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - node->entries, node->count);)
} while (path[level].pnext == node->entries + node->count);
};
}
/*
* Delete algorithm (flesh this out)
*
* reached the end of an index block:
* try to merge with an index block in hold[]
* if can't merge then maybe can rebalance
* if can't merge then release the block in hold[] and move this block to hold[]
* can't merge if there's no block in hold[] or can't fit two together
* if can merge
* release and free this index block and
* delete from parent:
* if parent count zero, the grandparent key is going to be deleted, updating the pivot
* otherwise parent's deleted key becomes new pivot
*/
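/*
 * Merging example, with the BUSHY setting of 10 entries per node: when a node
 * is finished and hold[level] already holds its predecessor with 4 entries, a
 * finished node with up to 6 entries is merged into it
 * (this->count <= blocks_per_node - prev->count), its index entry is removed
 * from the parent and its block freed; otherwise the predecessor is released
 * and the finished node takes its place in hold[level].
 */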
static inline struct enode *path_node(struct etree_path path[], int level)
{
return buffer2node(path[level].buffer);
}
static inline int finished_level(struct etree_path path[], int level)
{
struct enode *node = path_node(path, level);
return path[level].pnext == node->entries + node->count;
}
static void remove_index(struct etree_path path[], int level)
{
struct enode *node = path_node(path, level);
chunk_t pivot = (path[level].pnext)->key; // !!! out of bounds for delete of last from full index
int count = node->count, i;
// stomps the node count (if 0th key holds count)
memmove(path[level].pnext - 1, path[level].pnext,
(char *)&node->entries[count] - (char *)path[level].pnext);
node->count = count - 1;
--(path[level].pnext);
set_buffer_dirty(path[level].buffer);
// no pivot for last entry
if (path[level].pnext == node->entries + node->count)
return;
// climb up to common parent and set pivot to deleted key
// what if index is now empty? (no deleted key)
// then some key above is going to be deleted and used to set pivot
if (path[level].pnext == node->entries && level) {
for (i = level - 1; path[i].pnext - 1 == path_node(path, i)->entries; i--)
if (!i)
return;
(path[i].pnext - 1)->key = pivot;
set_buffer_dirty(path[i].buffer);
}
}
static void brelse_free(struct superblock *sb, struct buffer *buffer)
{
brelse(buffer);
if (buffer->count) {
warn("free block %Lx still in use!", (long long)buffer->sector);
return;
}
free_block(sb, buffer->sector);
evict_buffer(buffer);
}
static chunk_t delete_tree_range(struct superblock *sb, u64 snapmask, chunk_t resume, unsigned max_dirty)
{
int levels = sb->image.etree_levels, level = levels - 1;
struct etree_path path[levels], hold[levels];
struct buffer *leafbuf, *prevleaf = NULL;
unsigned i;
for (i = 0; i < levels; i++) // can be initializer if not dynamic array (change it?)
hold[i] = (struct etree_path){ };
leafbuf = probe(sb, resume, path);
while (1) { /* in-order leaf walk */
trace_off(show_leaf(buffer2leaf(leafbuf));)
// should pass in and act on max_dirty...
if (delete_snapshots_from_leaf(sb, buffer2leaf(leafbuf), snapmask, max_dirty))
set_buffer_dirty(leafbuf);
if (prevleaf) { /* try to merge this leaf with prev */
struct eleaf *this = buffer2leaf(leafbuf);
struct eleaf *prev = buffer2leaf(prevleaf);
trace_off(warn("check leaf %p against %p", leafbuf, prevleaf);)
trace_off(warn("need = %i, free = %i", leaf_payload(this), leaf_freespace(prev));)
if (leaf_payload(this) <= leaf_freespace(prev)) {
trace_off(warn(">>> can merge leaf %p into leaf %p", leafbuf, prevleaf);)
merge_leaves(prev, this);
remove_index(path, level);
set_buffer_dirty(prevleaf);
brelse_free(sb, leafbuf);
goto keep_prev_leaf;
}
brelse(prevleaf);
}
prevleaf = leafbuf;
keep_prev_leaf:
if (finished_level(path, level)) {
do { /* pop and try to merge finished nodes */
if (hold[level].buffer) {
assert(level); /* root node can't have any prev */
struct enode *this = path_node(path, level);
struct enode *prev = path_node(hold, level);
trace_off(warn("check node %p against %p", this, prev);)
trace_off(warn("this count = %i prev count = %i", this->count, prev->count);)
if (this->count <= sb->blocks_per_node - prev->count) {
trace_off(warn(">>> can merge node %p into node %p", this, prev);)
merge_nodes(prev, this);
remove_index(path, level - 1);
set_buffer_dirty(hold[level].buffer);
brelse_free(sb, path[level].buffer);
goto keep_prev_node;
}
brelse(hold[level].buffer);
}
hold[level].buffer = path[level].buffer;
keep_prev_node:
if (!level) { /* remove levels if possible */
while (levels > 1 && path_node(hold, 0)->count == 1) {
trace_off(warn("drop btree level");)
sb->image.etree_root = hold[1].buffer->sector;
brelse_free(sb, hold[0].buffer);
levels = --sb->image.etree_levels;
memcpy(hold, hold + 1, levels * sizeof(hold[0]));
set_sb_dirty(sb);
}
brelse(prevleaf);
brelse_path(hold, levels);
return 0;
}
level--;
trace_off(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - path_node(path, level)->entries, path_node(path, level)->count);)
} while (finished_level(path, level));
do { /* push back down to leaf level */
struct buffer *nodebuf = snapread(sb, path[level++].pnext++->sector);
path[level].buffer = nodebuf;
path[level].pnext = buffer2node(nodebuf)->entries;
trace_off(printf("push to level %i, %i nodes\n", level, path_node(path, level)->count);)
} while (level < levels - 1);
}
/* Exit if dirty buffer count above threshold */
// might incur a few extra index reads but oh well
if (dirty_buffer_count >= max_dirty) {
brelse(prevleaf);
brelse_path(path, levels);
for (i = 0; i < levels; i++)
if (hold[i].buffer)
brelse(hold[i].buffer);
// sb->delete_progress = ???;
return 1;
}
leafbuf = snapread(sb, path[level].pnext++->sector);
};
}
static int delete_snapshot(struct superblock *sb, unsigned tag)
{
struct snapshot *snapshot;
unsigned i, bit;
for (i = 0; i < sb->image.snapshots; i++)
if (sb->image.snaplist[i].tag == tag)
goto delete;
return -1;
delete:
snapshot = sb->image.snaplist + i;
bit = snapshot->bit;
trace_on(printf("Delete snapshot %i (internal %i)\n", tag, bit);)
memmove(snapshot, snapshot + 1, (char *)(sb->image.snaplist + --sb->image.snapshots) - (char *)snapshot);
sb->snapmask &= ~(1ULL << bit);
delete_snapshots_from_tree(sb, 1ULL << bit);
set_sb_dirty(sb);
return 0;
};
static void show_snapshots(struct superblock *sb)
{
unsigned snapnum, snapshots = sb->image.snapshots;
printf("%u snapshots\n", snapshots);
for (snapnum = 0; snapnum < snapshots; snapnum++) {
struct snapshot *snapshot = sb->image.snaplist + snapnum;
printf("snapshot %u tag %u prio %i created %x\n",
snapshot->bit,
snapshot->tag,
snapshot->prio,
snapshot->ctime);
}
};
/* Lock snapshot reads against origin writes */
static void reply(fd_t sock, struct messagebuf *message)
{
trace(warn("%x/%u", message->head.code, message->head.length);)
writepipe(sock, &message->head, message->head.length + sizeof(message->head));
}
struct client
{
u64 id;
fd_t sock;
int snap;
};
struct pending
{
unsigned holdcount;
struct client *client;
struct messagebuf message;
};
struct snaplock_wait
{
struct pending *pending;
struct snaplock_wait *next;
};
struct snaplock_hold
{
struct client *client;
struct snaplock_hold *next;
};
struct snaplock
{
struct snaplock_wait *waitlist;
struct snaplock_hold *holdlist;
struct snaplock *next;
chunk_t chunk;
};
struct snaplock *new_snaplock(struct superblock *sb)
{
return malloc(sizeof(struct snaplock));
}
struct snaplock_wait *new_snaplock_wait(struct superblock *sb)
{
return malloc(sizeof(struct snaplock_wait));
}
struct snaplock_hold *new_snaplock_hold(struct superblock *sb)
{
return malloc(sizeof(struct snaplock_hold));
}
void free_snaplock(struct superblock *sb, struct snaplock *p)
{
free(p);
}
void free_snaplock_hold(struct superblock *sb, struct snaplock_hold *p)
{
free(p);
}
void free_snaplock_wait(struct superblock *sb, struct snaplock_wait *p)
{
free(p);
}
unsigned snaplock_hash(struct superblock *sb, chunk_t chunk)
{
return ((u32)(chunk * 3498734713U)) >> (32 - sb->snaplock_hash_bits);
}
struct snaplock *find_snaplock(struct snaplock *list, chunk_t chunk)
{
for (; list; list = list->next)
if (list->chunk == chunk)
return list;
return NULL;
}
void waitfor_chunk(struct superblock *sb, chunk_t chunk, struct pending **pending)
{
struct snaplock *lock;
if ((lock = find_snaplock(sb->snaplocks[snaplock_hash(sb, chunk)], chunk))) {
if (!*pending) {
// arguably we should know the client and fill it in here
*pending = calloc(1, sizeof(struct pending));
(*pending)->holdcount = 1;
}
struct snaplock_wait *wait = new_snaplock_wait(sb);
wait->pending = *pending;
wait->next = lock->waitlist;
lock->waitlist = wait;
(*pending)->holdcount++;
}
}
void readlock_chunk(struct superblock *sb, chunk_t chunk, struct client *client)
{
struct snaplock **bucket = &sb->snaplocks[snaplock_hash(sb, chunk)];
struct snaplock *lock;
if (!(lock = find_snaplock(*bucket, chunk))) {
lock = new_snaplock(sb);
*lock = (struct snaplock){ .chunk = chunk, .next = *bucket };
*bucket = lock;
}
struct snaplock_hold *hold = new_snaplock_hold(sb);
hold->client = client;
hold->next = lock->holdlist;
lock->holdlist = hold;
}
struct snaplock *release_lock(struct superblock *sb, struct snaplock *lock, struct client *client)
{
struct snaplock *ret = lock;
struct snaplock_hold **holdp = &lock->holdlist;
while (*holdp && (*holdp)->client != client)
holdp = &(*holdp)->next;
if (!*holdp) {
trace_on(printf("chunk %Lx holder %Lu not found\n", lock->chunk, client->id);)
return NULL;
}
/* Delete and free holder record */
struct snaplock_hold *next = (*holdp)->next;
free_snaplock_hold(sb, *holdp);
*holdp = next;
if (lock->holdlist)
return ret;
/* Release and delete waiters, delete lock */
struct snaplock_wait *list = lock->waitlist;
while (list) {
struct snaplock_wait *next = list->next;
assert(list->pending->holdcount);
if (!--(list->pending->holdcount)) {
struct pending *pending = list->pending;
reply(pending->client->sock, &pending->message);
free(pending);
}
free_snaplock_wait(sb, list);
list = next;
}
ret = lock->next;
free_snaplock(sb, lock);
return ret;
}
int release_chunk(struct superblock *sb, chunk_t chunk, struct client *client)
{
trace(printf("release %Lx\n", chunk);)
struct snaplock **lockp = &sb->snaplocks[snaplock_hash(sb, chunk)];
/* Find pointer to lock record */
while (*lockp && (*lockp)->chunk != chunk)
lockp = &(*lockp)->next;
struct snaplock *next, *lock = *lockp;
if (!lock) {
trace_on(printf("chunk %Lx not locked\n", chunk);)
return -1;
}
next = release_lock(sb, lock, client);
if (!next)
return -2;
*lockp = next;
return 0;
}
void show_locks(struct superblock *sb)
{
unsigned n = 0, i;
for (i = 0; i < (1 << sb->snaplock_hash_bits); i++) {
struct snaplock *lock = sb->snaplocks[i];
if (!lock)
continue;
if (!n) printf("Locks:\n");
printf("[%03u] ", i);
do {
printf("chunk %Lx ", lock->chunk);
struct snaplock_hold *hold = lock->holdlist;
for (; hold; hold = hold->next)
printf("held by client %Lu ", hold->client->id);
struct snaplock_wait *wait = lock->waitlist;
for (; wait; wait = wait->next)
printf("wait [%02hx/%u] ", snaplock_hash(sb, (u32)wait->pending), wait->pending->holdcount);
} while ((lock = lock->next));
printf("\n");
n++;
}
if (!n) printf("-- no locks --\n");
}
/* Build up a response as a list of chunk ranges */
struct addto
{
unsigned count;
chunk_t firstchunk;
chunk_t nextchunk;
struct rwmessage *reply;
shortcount *countp;
chunk_t *top;
char *lim;
};
void check_response_full(struct addto *r, unsigned bytes)
{
if ((char *)r->top < r->lim - bytes)
return;
error("Need realloc");
}
void addto_response(struct addto *r, chunk_t chunk)
{
if (chunk != r->nextchunk) {
if (r->top) {
trace_off(warn("finish old range\n");)
*(r->countp) = (r->nextchunk - r->firstchunk);
} else {
trace_off(warn("alloc new reply");)
r->reply = (void *) malloc(sizeof(struct messagebuf));
r->top = (chunk_t *)(((char *)r->reply) + sizeof(struct head) + offsetof(struct rw_request, ranges));
r->lim = ((char *)r->reply) + maxbody;
r->count++;
}
trace_off(warn("start new range");)
check_response_full(r, 2*sizeof(chunk_t));
r->firstchunk = *(r->top)++ = chunk;
r->countp = (shortcount *)r->top;
r->top = (chunk_t *)(((shortcount *)r->top) + 1);
}
r->nextchunk = chunk + 1;
}
int finish_reply_(struct addto *r, unsigned code, unsigned id)
{
if (!r->countp)
return 0;
*(r->countp) = (r->nextchunk - r->firstchunk);
r->reply->head.code = code;
r->reply->head.length = (char *)r->top - (char *)r->reply - sizeof(struct head);
r->reply->body.id = id;
r->reply->body.count = r->count;
return 1;
}
void finish_reply(int sock, struct addto *r, unsigned code, unsigned id)
{
if (finish_reply_(r, code, id))
reply(sock, (struct messagebuf *)r->reply);
free(r->reply);
}
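/*
 * For example, feeding chunks 5, 6, 7 and then 9 to addto_response() yields
 * two ranges in the reply body, (start 5, count 3) and (start 9, count 1),
 * the final count being filled in by finish_reply_() when the message is
 * closed off.
 */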
/* Initialization, State load/save */
void setup_sb(struct superblock *sb)
{
unsigned blocksize_bits = sb->image.blocksize_bits;
unsigned chunksize_bits = sb->image.chunksize_bits;
sb->blocksize = 1 << blocksize_bits;
sb->chunksize = 1 << chunksize_bits;
sb->sectors_per_block_bits = blocksize_bits - SECTOR_BITS;
sb->sectors_per_chunk_bits = chunksize_bits - SECTOR_BITS;
sb->blocks_per_node = (sb->blocksize - offsetof(struct enode, entries)) / sizeof(struct index_entry);
#ifdef BUSHY
sb->blocks_per_node = 10;
#endif
sb->copybuf = malloc_aligned(sb->copybuf_size = (32 * sb->chunksize), 4096); // !!! check failed
sb->sectors_per_block = 1 << sb->sectors_per_block_bits;
sb->sectors_per_chunk = 1 << sb->sectors_per_chunk_bits;
sb->snapmask = 0;
sb->flags = 0;
sb->max_commit_blocks = (sb->blocksize - sizeof(struct commit_block)) / sizeof(sector_t);
unsigned snaplock_hash_bits = 8;
sb->snaplock_hash_bits = snaplock_hash_bits;
sb->snaplocks = (struct snaplock **)calloc(1 << snaplock_hash_bits, sizeof(struct snaplock *));
}
void load_sb(struct superblock *sb)
{
struct buffer *buffer = bread(sb->snapdev, SB_SECTOR, SB_SIZE);
memcpy(&sb->image, buffer->data, sizeof(sb->image));
assert(!memcmp(sb->image.magic, SB_MAGIC, sizeof(sb->image.magic)));
brelse(buffer);
setup_sb(sb);
sb->snapmask = calc_snapmask(sb);
trace_on(printf("Active snapshot mask: %016llx\n", sb->snapmask);)
}
void save_sb(struct superblock *sb)
{
if (sb->flags & SB_DIRTY) {
struct buffer *buffer = getblk(sb->snapdev, SB_SECTOR, SB_SIZE);
memcpy(buffer->data, &sb->image, sizeof(sb->image));
write_buffer(buffer);
brelse(buffer);
sb->flags &= ~SB_DIRTY;
}
}
void save_state(struct superblock *sb)
{
flush_buffers();
save_sb(sb);
}
/*
* This source compiles either the snapshot server or the snapshot store setup
* utility, depending on whether the macro variable CREATE is defined.
*
* I'll leave all the testing hooks lying around in the main routine for now,
* since the low level components still tend to break every now and then and
* require further unit testing.
*/
int init_snapstore(struct superblock *sb)
{
int i, error;
unsigned sectors_per_block_bits = 3;
sb->image = (struct disksuper){ .magic = SB_MAGIC };
sb->image.etree_levels = 1;
sb->image.blocksize_bits = SECTOR_BITS + sectors_per_block_bits;
sb->image.chunksize_bits = sb->image.blocksize_bits; // !!! just for now
setup_sb(sb);
u64 size;
if ((error = fd_size(sb->snapdev, &size)))
error("Error %i: %s determining snapshot store size", error, strerror(error));
sb->image.chunks = size >> sb->image.chunksize_bits;
if ((error = fd_size(sb->orgdev, &size)))
error("Error %i: %s determining origin volume size", errno, strerror(errno));
sb->image.orgchunks = size >> sb->image.chunksize_bits;
sb->image.journal_size = 100;
#ifdef TEST_JOURNAL
sb->image.journal_size = 5;
#endif
sb->image.journal_next = 0;
sb->image.sequence = sb->image.journal_size;
init_allocation(sb);
set_sb_dirty(sb);
for (i = 0; i < sb->image.journal_size; i++) {
struct buffer *buffer = jgetblk(sb, i);
struct commit_block *commit = (struct commit_block *)buffer->data;
*commit = (struct commit_block){ .magic = JMAGIC, .sequence = i };
#ifdef TEST_JOURNAL
commit->sequence = (i + 3) % 5;
#endif
commit->checksum = -checksum_block(sb, (void *)commit);
brelse_dirty(buffer);
}
#ifdef TEST_JOURNAL
show_journal(sb);
show_tree(sb);
flush_buffers();
recover_journal(sb);
show_buffers();
#endif
#if 0
printf("chunk = %Lx\n", alloc_chunk_range(sb, sb->image.chunks - 1, 1));
// struct buffer *buffer = snapread(sb, sb->image.bitmap_base + 3 * 8);
// dump_buffer(buffer, 4090, 6);
return 0;
#endif
#if 0
grab_chunk(sb, 32769);
struct buffer *buffer = snapread(sb, sb->image.bitmap_base + 8);
printf("sector %Lx\n", buffer->sector);
free_chunk(sb, 32769);
return 0;
#endif
struct buffer *leafbuf = new_leaf(sb);
struct buffer *rootbuf = new_node(sb);
buffer2node(rootbuf)->count = 1;
buffer2node(rootbuf)->entries[0].sector = leafbuf->sector;
sb->image.etree_root = rootbuf->sector;
#if 0
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
// free_chunk(sb, 23);
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
printf("chunk = %Lx\n", alloc_chunk(sb));
return 0;
#endif
brelse_dirty(rootbuf);
brelse_dirty(leafbuf);
#if 0
struct buffer *leafbuf1 = new_leaf(sb);
struct buffer *leafbuf2 = new_leaf(sb);
struct eleaf *leaf1 = buffer2leaf(leafbuf1);
struct eleaf *leaf2 = buffer2leaf(leafbuf2);
init_leaf(leaf1, 256);
init_leaf(leaf2, 256);
add_exception_to_leaf(leaf1, 0x111, 0x11, 0, 3);
add_exception_to_leaf(leaf2, 0x222, 0x11, 1, 3);
add_exception_to_leaf(leaf2, 0x333, 0x33, 1, 3);
show_leaf(leaf1);
show_leaf(leaf2);
merge_leaves(leaf1, leaf2);
show_leaf(leaf1);
return 0;
#endif
#ifdef TEST_JOURNAL
show_buffers();
show_dirty_buffers();
commit_transaction(sb);
evict_buffers();
show_journal(sb);
show_tree(sb);
recover_journal(sb);
evict_buffers();
show_tree(sb);
#endif
save_state(sb);
return 0;
}
// Expand snapshot store:
// Calculate num bitmap blocks for new size
// Copy bitmap blocks to current top
// Clear new bitmap blocks
// Reserve new bitmap blocks
// Clear remainder bits in old last bitmap byte
// Set remainder bits in new last bitmap byte
// Set new bitmap base and chunks count
static void expand_snapstore(struct superblock *sb, u64 newchunks)
{
u64 oldchunks = sb->image.chunks;
unsigned oldbitmaps = sb->image.bitmap_blocks;
unsigned newbitmaps = calc_bitmap_blocks(sb, newchunks);
unsigned blocksize = sb->blocksize;
unsigned blockshift = sb->image.blocksize_bits;
u64 oldbase = sb->image.bitmap_base << SECTOR_BITS;
u64 newbase = sb->image.bitmap_base << SECTOR_BITS; // !!! same as oldbase for now
int i;
for (i = 0; i < oldbitmaps; i++) {
// do it one block at a time for now !!! sucks
// maybe should do copy with bread/write?
pread(sb->snapdev, sb->copybuf, blocksize, oldbase + (i << blockshift)); // 64 bit!!!
pwrite(sb->snapdev, sb->copybuf, blocksize, newbase + (i << blockshift)); // 64 bit!!!
}
if ((oldchunks & 7)) {
sector_t sector = (oldbase >> SECTOR_BITS) + ((oldbitmaps - 1) << sb->sectors_per_block_bits);
struct buffer *buffer = getblk(sb->snapdev, sector, blocksize);
buffer->data[(oldchunks >> 3) & (blocksize - 1)] &= ~(0xff << (oldchunks & 7));
brelse_dirty(buffer);
}
for (i = oldbitmaps; i < newbitmaps; i++) {
struct buffer *buffer = getblk(sb->snapdev, (newbase >> SECTOR_BITS) + (i << sb->sectors_per_block_bits), blocksize); // advance to the i'th bitmap block
memset(buffer->data, 0, sb->blocksize);
/* Suppress overrun allocation in partial last byte */
if (i == newbitmaps - 1 && (newchunks & 7))
buffer->data[(newchunks >> 3) & (blocksize - 1)] |= 0xff << (newchunks & 7);
brelse_dirty(buffer);
}
for (i = 0; i < newbitmaps; i++) {
grab_chunk(sb, (newbase >> blockshift) + i); // !!! assume blocksize = chunksize
}
sb->image.bitmap_base = newbase >> SECTOR_BITS;
sb->image.chunks = newchunks;
save_state(sb);
}
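/*
* Scan every bucket of the snapshot read-lock hash for locks held by the
* given client. With check nonzero this just reports (returns 1) as soon
* as one is found; with check zero it releases each lock the client holds,
* re-examining the same list position after release_lock() rewrites the
* link. Used via the two macros below.
*/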
int client_locks(struct superblock *sb, struct client *client, int check)
{
int i;
for (i = 0; i < (1 << sb->snaplock_hash_bits); i++) {
struct snaplock **lockp = &sb->snaplocks[i];
while (*lockp) {
struct snaplock_hold *hold;
for (hold = (*lockp)->holdlist; hold; hold = hold->next)
if (hold->client == client) {
if (check)
return 1;
*lockp = release_lock(sb, *lockp, client);
goto next;
}
lockp = &(*lockp)->next;
next:
continue;
}
}
return 0;
}
#define check_client_locks(x, y) client_locks(x, y, 1)
#define free_client_locks(x, y) client_locks(x, y, 0)
/*
* Responses to IO requests take two quite different paths through the
* machinery:
*
* - Origin write requests are just sent back with their message
*   code changed, unless they have to wait for a snapshot read
*   lock, in which case the incoming buffer is copied and the
*   response takes a kafkaesque journey through the read locking
*   bureaucracy.
*
* - Responses to snapshot read or write requests have to be built
*   up painstakingly in allocated buffers, keeping a lot of state
*   around so that they end up with a minimum number of contiguous
*   chunk ranges. Once complete they can always be sent
*   immediately.
*
* To mess things up further, snapshot read requests can return both
* a list of origin ranges and a list of snapshot store ranges. In
* the latter case the specific snapshot store chunks in each logical
* range are also returned, because they can (and normally will) be
* discontiguous. This goes back to the client in two separate
* messages, on the theory that the client will find it nice to be
* able to process the origin read ranges and snapshot read chunks
* separately. We'll see how good an idea that is.
*
* The implementation ends up looking nice and tidy, but appearances
* can be deceiving.
*/
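/*
* Rough sketch of the snapshot read exchange handled below (the chunk
* layout is shown only for illustration):
*
*   client -> server  QUERY_SNAPSHOT_READ         ranges to read
*   server -> client  REPLY_SNAPSHOT_READ_ORIGIN  ranges still shared with
*                                                 the origin (read locked here)
*   server -> client  REPLY_SNAPSHOT_READ         exception ranges, each one
*                                                 followed by its store chunks
*   client -> server  FINISH_SNAPSHOT_READ        drop the origin read locks
*/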
int incoming(struct superblock *sb, struct client *client)
{
struct messagebuf message;
unsigned sock = client->sock;
int i, j, err;
if ((err = readpipe(sock, &message.head, sizeof(message.head))))
goto pipe_error;
trace(warn("%x/%u", message.head.code, message.head.length);)
if (message.head.length > maxbody)
goto message_too_long;
if ((err = readpipe(sock, &message.body, message.head.length)))
goto pipe_error;
switch (message.head.code) {
case QUERY_WRITE:
if (client->snap == -1) {
struct pending *pending = NULL;
struct rw_request *body = (struct rw_request *)message.body;
struct chunk_range *p = body->ranges;
chunk_t chunk;
if (message.head.length < sizeof(*body))
goto message_too_short;
trace(printf("origin write query, %u ranges\n", body->count);)
for (i = 0; i < body->count; i++, p++)
for (j = 0, chunk = p->chunk; j < p->chunks; j++, chunk++) {
chunk_t exception = make_unique(sb, chunk, -1);
if (exception == -1)
error("snapshot store full"); // !!! don't abort, send error
if (exception)
waitfor_chunk(sb, chunk, &pending);
}
finish_copyout(sb);
commit_transaction(sb);
message.head.code = REPLY_ORIGIN_WRITE;
if (pending) {
pending->client = client;
memcpy(&pending->message, &message, message.head.length + sizeof(struct head));
pending->holdcount--;
break;
}
reply(sock, &message);
break;
}
struct rw_request *body = (struct rw_request *)message.body;
if (message.head.length < sizeof(*body))
goto message_too_short;
trace(printf("snapshot write request, %u ranges\n", body->count);)
struct addto snap = { .nextchunk = -1 };
for (i = 0; i < body->count; i++)
for (j = 0; j < body->ranges[i].chunks; j++) {
chunk_t chunk = body->ranges[i].chunk + j;
chunk_t exception = make_unique(sb, chunk, client->snap);
if (exception == -1)
error("snapshot store full"); // !!! don't abort, send error
trace(printf("exception = %Lx\n", exception);)
addto_response(&snap, chunk);
check_response_full(&snap, sizeof(chunk_t));
*(snap.top)++ = exception;
}
finish_copyout(sb);
commit_transaction(sb);
finish_reply(client->sock, &snap, REPLY_SNAPSHOT_WRITE, body->id);
break;
case QUERY_SNAPSHOT_READ:
{
struct rw_request *body = (struct rw_request *)message.body;
if (message.head.length < sizeof(*body))
goto message_too_short;
trace(printf("snapshot read request, %u ranges\n", body->count);)
struct addto snap = { .nextchunk = -1 }, org = { .nextchunk = -1 };
for (i = 0; i < body->count; i++)
for (j = 0; j < body->ranges[i].chunks; j++) {
chunk_t chunk = body->ranges[i].chunk + j, exception = 0;
trace(warn("read %Lx", chunk));
test_unique(sb, chunk, client->snap, &exception);
if (exception) {
trace(warn("read exception %Lx", exception));
addto_response(&snap, chunk);
check_response_full(&snap, sizeof(chunk_t));
*(snap.top)++ = exception;
} else {
trace(warn("read origin %Lx", chunk));
addto_response(&org, chunk);
readlock_chunk(sb, chunk, client);
}
}
finish_reply(client->sock, &org, REPLY_SNAPSHOT_READ_ORIGIN, body->id);
finish_reply(client->sock, &snap, REPLY_SNAPSHOT_READ, body->id);
break;
}
case FINISH_SNAPSHOT_READ:
{
struct rw_request *body = (struct rw_request *)message.body;
if (message.head.length < sizeof(*body))
goto message_too_short;
trace(printf("finish snapshot read, %u ranges\n", body->count);)
for (i = 0; i < body->count; i++)
for (j = 0; j < body->ranges[i].chunks; j++)
release_chunk(sb, body->ranges[i].chunk + j, client);
break;
}
case IDENTIFY:
{
int tag = ((struct identify *)message.body)->snap, snap = tag_snapnum(sb, tag);
if (snap >= 0)
client->snap = snap;
client->id = ((struct identify *)message.body)->id;
warn("client id %Li, snapshot %i (snapnum %i)", client->id, tag, snap);
outbead(sock, REPLY_IDENTIFY, struct { });
break;
}
case UPLOAD_LOCK:
break;
case FINISH_UPLOAD_LOCK:
break;
case CREATE_SNAPSHOT:
create_snapshot(sb, ((struct create_snapshot *)message.body)->snap);
save_state(sb);
outbead(sock, REPLY_CREATE_SNAPSHOT, struct { });
break;
case DELETE_SNAPSHOT:
delete_snapshot(sb, ((struct create_snapshot *)message.body)->snap);
save_state(sb);
outbead(sock, REPLY_DELETE_SNAPSHOT, struct { });
break;
case INITIALIZE_SNAPSTORE:
init_snapstore(sb);
break;
case DUMP_TREE:
show_tree(sb);
break;
case START_SERVER:
warn("Activating server");
load_sb(sb);
if (sb->image.flags & SB_BUSY) {
warn("Server was not shut down properly");
jtrace(show_journal(sb);)
recover_journal(sb);
} else {
sb->image.flags |= SB_BUSY;
set_sb_dirty(sb);
save_sb(sb);
}
break;
case LIST_SNAPSHOTS:
{
int i, n = sb->image.snapshots;
struct snapshot *snap = sb->image.snaplist;
outhead(sock, SNAPSHOT_LIST, sizeof(struct snaplist) + n * sizeof(struct snapinfo));
for (i = 0; i < n; i++, snap++)
write(sock, &(struct snapinfo){
.snap = snap->tag,
.prio = snap->prio,
.ctime = snap->ctime },
sizeof(struct snapinfo));
break;
}
case SHUTDOWN_SERVER:
return -2;
default:
outbead(sock, REPLY_ERROR, struct { int code; char error[50]; }, message.head.code, "Unknown message"); // wrong!!!
}
#if 0
static int messages = 0;
if (++messages == 5) {
warn(">>>>Simulate server crash<<<<");
exit(1);
}
#endif
return 0;
message_too_long:
warn("message %x too long (%u bytes)\n", message.head.code, message.head.length);
return -1;
message_too_short:
warn("message %x too short (%u bytes)\n", message.head.code, message.head.length);
return -1;
pipe_error:
return -1; /* we quietly drop the client if the connect breaks */
}
/* Signal Delivery via pipe */
static int sigpipe;
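/* The handler just forwards the signal number down a pipe (the usual
self-pipe trick) so the main poll() loop in snap_server() can pick it
up as ordinary input on the getsig end. */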
void sighandler(int signum)
{
trace_off(printf("caught signal %i\n", signum);)
write(sigpipe, (char[]){signum}, 1);
}
int cleanup(struct superblock *sb)
{
warn("cleaning up");
sb->image.flags &= ~SB_BUSY;
set_sb_dirty(sb);
save_state(sb);
return 0;
}
int resolve_host(char *name, int family, void *result, int length)
{
struct hostent host, *bogus;
char work[500];
int err, dumb;
if ((err = gethostbyname2_r(name, family, &host, work, sizeof(work), &bogus, &dumb))) {
errno = err;
return -1;
}
memcpy(result, host.h_addr_list[0], host.h_length);
return host.h_length;
}
int resolve_self(int family, void *result, int length)
{
char name[HOST_NAME_MAX + 1];
if (gethostname(name, HOST_NAME_MAX) == -1)
return -1;
return resolve_host(name, family, result, length);
}
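/*
* Main server loop. Sets up a TCP listening socket for clients, a unix
* domain connection back to the control agent, and the signal pipe, then
* forks (the parent returns immediately) and multiplexes everything with
* poll(): slots 0..2 of pollvec are the listener, the signal pipe and the
* agent socket, and connected client sockets follow.
*/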
int snap_server(struct superblock *sb, const char *sockname, int port)
{
unsigned maxclients = 100, clients = 0, others = 3;
struct client *clientvec[maxclients];
struct pollfd pollvec[others+maxclients];
int listener, getsig, pipevec[2], err = 0;
if (pipe(pipevec))
error("Can't open pipe");
sigpipe = pipevec[1];
getsig = pipevec[0];
struct server server = { .port = htons(port), .type = AF_INET, };
if ((listener = socket(AF_INET, SOCK_STREAM, 0)) < 0)
error("Can't get socket");
if (bind(listener,
(struct sockaddr *)&(struct sockaddr_in){
.sin_family = server.type,
.sin_port = server.port,
.sin_addr = { .s_addr = INADDR_ANY } },
sizeof(struct sockaddr_in)) < 0)
error("Can't bind to socket");
listen(listener, 5);
warn("snap server bound to port %i", port);
/* Get agent connection */
struct sockaddr_un addr = { .sun_family = AF_UNIX };
int addr_len = sizeof(addr) - sizeof(addr.sun_path) + strlen(sockname);
int sock, len;
trace(warn("Connect to control socket %s", sockname);)
if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
error("Can't get socket");
strncpy(addr.sun_path, sockname, sizeof(addr.sun_path));
if (sockname[0] == '@')
addr.sun_path[0] = 0;
if (connect(sock, (struct sockaddr *)&addr, addr_len) == -1)
error("Can't connect to control socket");
trace_on(warn("Received control connection");)
pollvec[2] = (struct pollfd){ .fd = sock, .events = POLLIN };
if ((len = resolve_self(AF_INET, server.address, sizeof(server.address))) == -1)
error("Can't get own address, %s (%i)", strerror(errno), errno);
server.address_len = len;
warn("host = %x/%u", *(int *)server.address, server.address_len);
writepipe(sock, &(struct head){ SERVER_READY, sizeof(server) }, sizeof(struct head));
writepipe(sock, &server, sizeof(server));
#if 1
switch (fork()) {
case -1:
error("fork failed");
case 0: // !!! daemonize goes here
break;
default:
return 0;
}
#endif
pollvec[0] = (struct pollfd){ .fd = listener, .events = POLLIN };
pollvec[1] = (struct pollfd){ .fd = getsig, .events = POLLIN };
signal(SIGINT, sighandler);
signal(SIGTERM, sighandler);
signal(SIGPIPE, SIG_IGN);
while (1) {
int activity = poll(pollvec, others+clients, -1);
if (activity < 0) {
if (errno != EINTR)
error("poll failed, %s", strerror(errno));
continue;
}
if (!activity) {
printf("waiting...\n");
continue;
}
/* New connection? */
if (pollvec[0].revents) {
struct sockaddr_in addr;
socklen_t addr_len = sizeof(addr);
int sock;
if ((sock = accept(listener, (struct sockaddr *)&addr, &addr_len)) < 0)
error("Cannot accept connection");
trace_on(warn("Received connection");)
assert(clients < maxclients); // !!! send error and disconnect
struct client *client = malloc(sizeof(struct client));
*client = (struct client){ .sock = sock };
clientvec[clients] = client;
pollvec[others+clients] = (struct pollfd){ .fd = sock, .events = POLLIN };
clients++;
}
/* Signal? */
if (pollvec[1].revents) {
u8 sig = 0;
/* it's stupid but this read also gets interrupted, so... */
do { } while (read(getsig, &sig, 1) == -1 && errno == EINTR);
trace_on(warn("caught signal %i", sig);)
cleanup(sb); // !!! don't do it on segfault
if (sig == SIGINT) {
signal(SIGINT, SIG_DFL);
kill(getpid(), sig); /* commit hara-kiri */
}
goto done;
}
/* Agent message? */
if (pollvec[2].revents)
incoming(sb, &(struct client){ .sock = sock, .id = -2, .snap = -2 });
/* Client message? */
unsigned i = 0;
while (i < clients) {
if (pollvec[others+i].revents) { // !!! check for poll error
struct client *client = clientvec[i];
int result;
trace_off(printf("event on socket %i = %x\n", client->sock, pollvec[others+i].revents);)
if ((result = incoming(sb, client)) == -1) {
warn("Client %Li disconnected", client->id);
save_state(sb); // !!! just for now
close(client->sock);
free(client);
--clients;
clientvec[i] = clientvec[clients];
pollvec[others + i] = pollvec[others + clients];
continue;
}
if (result == -2) { // !!! wrong !!!
cleanup(sb);
goto done;
}
}
i++;
}
}
done:
// in a perfect world we'd close all the connections
close(listener);
return err;
}
#if 0
void usage(poptContext optCon, int exitcode, char *error, char *addl) {
poptPrintUsage(optCon, stderr, 0);
if (error) fprintf(stderr, "%s: %s\n", error, addl);
exit(exitcode);
}
#endif
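/* This array exists only to reference functions that would otherwise be
flagged as defined-but-unused; the "if (useme) ;" at the end of main()
keeps the array itself referenced. */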
static void *useme[] = {
_show_journal, delete_tree_range, expand_snapstore,
show_tree_range, snapnum_tag, show_snapshots,
};
int main(int argc, const char *argv[])
{
poptContext optCon;
unsigned volsize;
struct poptOption optionsTable[] = {
{ "size", 's', POPT_ARG_INT, &volsize, 0, "volume size", "size" },
POPT_AUTOHELP
{ NULL, 0, 0, NULL, 0 }
};
optCon = poptGetContext(NULL, argc, argv, optionsTable, 0);
#ifdef SERVER
poptSetOtherOptionHelp(optCon, "dev/snapshot dev/origin socket port");
if (argc < 5) {
#else
poptSetOtherOptionHelp(optCon, "dev/snapshot dev/origin");
if (argc < 3) {
#endif
poptPrintUsage(optCon, stderr, 0);
exit(1);
}
int c;
while ((c = poptGetNextOpt(optCon)) >= 0)
;
#if 0
snapname = poptGetArg(optCon);
if(!(poptPeekArg(optCon) == NULL)) {
poptPrintUsage(optCon, stderr, 0);
exit(1);
}
#endif
if (c < -1) {
fprintf(stderr, "%s: %s\n",
poptBadOption(optCon, POPT_BADOPTION_NOALIAS),
poptStrerror(c));
return 1;
}
struct superblock *sb = &(struct superblock){};
init_buffers();
#ifdef SERVER
if (argc < 5)
error("usage: %s dev/snapshot dev/origin socket port", argv[0]);
#else
if (argc < 3)
error("usage: %s dev/snapshot dev/origin", argv[0]);
#endif
if ((sb->snapdev = open(poptGetArg(optCon), O_RDWR | O_DIRECT)) == -1)
error("Could not open snapshot store %s", argv[1]);
if ((sb->orgdev = open(poptGetArg(optCon), O_RDONLY | O_DIRECT)) == -1)
error("Could not open origin volume %s", argv[2]);
#ifdef SERVER
const char *sockname = poptGetArg(optCon);
int port = atoi(poptGetArg(optCon));
poptFreeContext(optCon);
return snap_server(sb, sockname, port);
#else
poptFreeContext(optCon);
#if 0
init_snapstore(sb);
create_snapshot(sb, 0);
int i;
for (i = 0; i < 100; i++) {
make_unique(sb, i, 0);
}
flush_buffers();
evict_buffers();
warn("delete...");
delete_tree_range(sb, 1, 0, 5);
show_buffers();
warn("dirty buffers = %i", dirty_buffer_count);
show_tree(sb);
return 0;
#endif
return init_snapstore(sb);
#endif
if (useme) ;
}