
diff --git a/gfs-kernel/src/gfs/daemon.c b/gfs-kernel/src/gfs/daemon.c
index 378e0a794..c0c805789 100644
--- a/gfs-kernel/src/gfs/daemon.c
+++ b/gfs-kernel/src/gfs/daemon.c
@@ -1,181 +1,185 @@
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/buffer_head.h>
#include "gfs.h"
#include "daemon.h"
#include "glock.h"
#include "log.h"
#include "quota.h"
#include "recovery.h"
#include "super.h"
#include "unlinked.h"
+#include "rgrp.h"
/**
* gfs_scand - Look for cached glocks and inodes to toss from memory
* @sdp: Pointer to GFS superblock
*
* One of these daemons runs, finding candidates to add to sd_reclaim_list.
* See gfs_glockd()
*/
int
gfs_scand(void *data)
{
struct gfs_sbd *sdp = (struct gfs_sbd *)data;
while (!kthread_should_stop()) {
gfs_scand_internal(sdp);
schedule_timeout_interruptible(gfs_tune_get(sdp, gt_scand_secs) * HZ);
}
return 0;
}
/**
* gfs_glockd - Reclaim unused glock structures
* @sdp: Pointer to GFS superblock
*
* One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
* sd_glockd_num says how many daemons are running now.
* Number of daemons can be set by user, with num_glockd mount option.
* See gfs_scand()
*/
int
gfs_glockd(void *data)
{
struct gfs_sbd *sdp = (struct gfs_sbd *)data;
while (!kthread_should_stop()) {
while (atomic_read(&sdp->sd_reclaim_count))
gfs_reclaim_glock(sdp);
wait_event_interruptible(sdp->sd_reclaim_wchan,
(atomic_read(&sdp->sd_reclaim_count) ||
kthread_should_stop()));
}
return 0;
}
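/*
 * Illustrative sketch, not part of this patch: the producer side of the
 * reclaim list that gfs_glockd() drains above.  gfs_scand_internal() is
 * assumed to queue candidate glocks roughly like this and then wake the
 * glockd waiters; the helper name example_queue_for_reclaim is hypothetical.
 */
static void example_queue_for_reclaim(struct gfs_sbd *sdp, struct gfs_glock *gl)
{
	spin_lock(&sdp->sd_reclaim_lock);
	list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
	spin_unlock(&sdp->sd_reclaim_lock);

	atomic_inc(&sdp->sd_reclaim_count);
	wake_up(&sdp->sd_reclaim_wchan);  /* satisfies wait_event_interruptible() above */
}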
/**
* gfs_recoverd - Recover dead machine's journals
* @sdp: Pointer to GFS superblock
*
*/
int
gfs_recoverd(void *data)
{
struct gfs_sbd *sdp = (struct gfs_sbd *)data;
while (!kthread_should_stop()) {
gfs_check_journals(sdp);
schedule_timeout_interruptible(gfs_tune_get(sdp, gt_recoverd_secs) * HZ);
}
return 0;
}
/**
* gfs_logd - Update log tail as Active Items get flushed to in-place blocks
* @sdp: Pointer to GFS superblock
*
* Also, periodically check to make sure that we're using the most recent
* journal index.
*/
int
gfs_logd(void *data)
{
struct gfs_sbd *sdp = (struct gfs_sbd *)data;
struct gfs_holder ji_gh;
while (!kthread_should_stop()) {
/* Advance the log tail */
gfs_ail_empty(sdp);
/* Check for latest journal index */
if (time_after_eq(jiffies,
sdp->sd_jindex_refresh_time +
gfs_tune_get(sdp, gt_jindex_refresh_secs) * HZ)) {
if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags) &&
!gfs_jindex_hold(sdp, &ji_gh))
gfs_glock_dq_uninit(&ji_gh);
sdp->sd_jindex_refresh_time = jiffies;
}
schedule_timeout_interruptible(gfs_tune_get(sdp, gt_logd_secs) * HZ);
}
return 0;
}
/**
* gfs_quotad - Write cached quota changes into the quota file
* @sdp: Pointer to GFS superblock
*
*/
int
gfs_quotad(void *data)
{
struct gfs_sbd *sdp = (struct gfs_sbd *)data;
int error;
while (!kthread_should_stop()) {
/* Update statfs file */
if (gfs_tune_get(sdp, gt_statfs_fast) &&
time_after_eq(jiffies,
sdp->sd_statfs_sync_time +
gfs_tune_get(sdp, gt_statfs_fast) * HZ)) {
error = gfs_statfs_sync(sdp);
if (error && error != -EROFS &&
!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
printk("GFS: fsid=%s: statfs: error = %d\n",
sdp->sd_fsname, error);
sdp->sd_statfs_sync_time = jiffies;
}
/* Update quota file */
if (time_after_eq(jiffies,
sdp->sd_quota_sync_time +
gfs_tune_get(sdp, gt_quota_quantum) * HZ)) {
error = gfs_quota_sync(sdp);
if (error &&
error != -EROFS &&
!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
printk("GFS: fsid=%s: quotad: error = %d\n",
sdp->sd_fsname, error);
sdp->sd_quota_sync_time = jiffies;
}
/* Clean up */
gfs_quota_scan(sdp);
schedule_timeout_interruptible(gfs_tune_get(sdp, gt_quotad_secs) * HZ);
}
return 0;
}
/**
* gfs_inoded - Deallocate unlinked inodes
* @sdp: Pointer to GFS superblock
*
*/
int
gfs_inoded(void *data)
{
struct gfs_sbd *sdp = (struct gfs_sbd *)data;
while (!kthread_should_stop()) {
+ uint64_t inodes, metadata;
gfs_unlinked_dealloc(sdp);
+ gfs_reclaim_metadata(sdp, &inodes, &metadata,
+ gfs_tune_get(sdp, gt_max_rgrp_free_mdata));
schedule_timeout_interruptible(gfs_tune_get(sdp, gt_inoded_secs) * HZ);
}
return 0;
}
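/*
 * Illustrative sketch, not part of this patch: each daemon above is a
 * kthread, so the mount code is assumed to start and stop them with the
 * standard kthread API, roughly as below.  The helper names start_inoded()
 * and stop_inoded() are hypothetical.
 */
static int start_inoded(struct gfs_sbd *sdp)
{
	struct task_struct *p;

	p = kthread_run(gfs_inoded, sdp, "gfs_inoded");
	if (IS_ERR(p))
		return PTR_ERR(p);
	sdp->sd_inoded_process = p;
	return 0;
}

static void stop_inoded(struct gfs_sbd *sdp)
{
	/* Sets the should-stop flag, wakes the thread, and waits for
	   gfs_inoded()'s loop to exit via kthread_should_stop(). */
	kthread_stop(sdp->sd_inoded_process);
}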
diff --git a/gfs-kernel/src/gfs/incore.h b/gfs-kernel/src/gfs/incore.h
index fedde4920..bad76fc6c 100644
--- a/gfs-kernel/src/gfs/incore.h
+++ b/gfs-kernel/src/gfs/incore.h
@@ -1,1207 +1,1211 @@
/*
* In-core (memory/RAM) structures.
* These do not appear on-disk. See gfs_ondisk.h for on-disk structures.
*/
#ifndef __INCORE_DOT_H__
#define __INCORE_DOT_H__
/* flags used in function call parameters */
#define DIO_NEW (0x00000001) /* Newly allocated metadata */
#define DIO_FORCE (0x00000002) /* Force read of block from disk */
#define DIO_CLEAN (0x00000004) /* Don't write to disk */
#define DIO_DIRTY (0x00000008) /* Data changed, must write to disk */
#define DIO_START (0x00000010) /* Start disk read or write */
#define DIO_WAIT (0x00000020) /* Wait for disk r/w to complete */
#define DIO_METADATA (0x00000040) /* Process glock's protected metadata */
#define DIO_DATA (0x00000080) /* Process glock's protected filedata */
#define DIO_INVISIBLE (0x00000100) /* Don't monkey with glock's dirty bit */
#define DIO_CHECK (0x00000200) /* Make sure all metadata has been synced */
#define DIO_ALL (0x00000400) /* Flush all AIL transactions to disk */
/* Structure prototypes */
struct gfs_log_operations;
struct gfs_log_element;
struct gfs_meta_header_cache;
struct gfs_depend;
struct gfs_bitmap;
struct gfs_rgrpd;
struct gfs_bufdata;
struct gfs_glock_operations;
struct gfs_holder;
struct gfs_glock;
struct gfs_alloc;
struct gfs_inode;
struct gfs_file;
struct gfs_unlinked;
struct gfs_quota_le;
struct gfs_quota_data;
struct gfs_log_buf;
struct gfs_trans;
struct gfs_gl_hash_bucket;
struct gfs_sbd;
typedef void (*gfs_glop_bh_t) (struct gfs_glock * gl, unsigned int ret);
/*
* Structure of operations that are associated with each
* type of element in the log.
*/
struct gfs_log_operations {
/*
* Operations specific to a given log element (LE).
* These are typically executed individually via macros such as LO_ADD.
*/
/* Add new LE to transaction */
void (*lo_add) (struct gfs_sbd * sdp, struct gfs_log_element * le);
/* Do any cleanup, etc., needed just before commit to incore log */
void (*lo_trans_end) (struct gfs_sbd * sdp,
struct gfs_log_element * le);
/* Print LE-specific info via printk() */
void (*lo_print) (struct gfs_sbd * sdp, struct gfs_log_element * le,
unsigned int where);
/* Find any incore transactions that overlap through this LE (e.g.
* share glocks), to determine if any transactions can be combined. */
struct gfs_trans *(*lo_overlap_trans) (struct gfs_sbd * sdp,
struct gfs_log_element * le);
/* Change LE from "new" to "incore" status, before write to log */
void (*lo_incore_commit) (struct gfs_sbd * sdp, struct gfs_trans * tr,
struct gfs_log_element * le);
/* Allow writes to in-place locations, after log is on-disk */
void (*lo_add_to_ail) (struct gfs_sbd * sdp,
struct gfs_log_element * le);
/* Clean up LE after log dump */
void (*lo_clean_dump) (struct gfs_sbd * sdp,
struct gfs_log_element * le);
/*
* Operations specific to a class of log elements.
* These are typically executed over a whole transaction by
* macros such as LO_TRANS_SIZE. Each LE-type-specific operation
* for each LE contributes its part to the overall result.
*/
/* Determine LE-type-specific quantities of blocks of various types
* required for writing the log */
void (*lo_trans_size) (struct gfs_sbd * sdp, struct gfs_trans * tr,
unsigned int *mblks, unsigned int *eblks,
unsigned int *blocks, unsigned int *bmem);
/* Combine LE-type-specific values in new_tr and tr, result is in tr */
void (*lo_trans_combine) (struct gfs_sbd * sdp, struct gfs_trans * tr,
struct gfs_trans * new_tr);
/* Create control and metadata buffers that will make up the log */
void (*lo_build_bhlist) (struct gfs_sbd * sdp, struct gfs_trans * tr);
/* Calculate log space needed for this LE in a log dump */
void (*lo_dump_size) (struct gfs_sbd * sdp, unsigned int *elements,
unsigned int *blocks, unsigned int *bmem);
/* Add LE to log dump */
void (*lo_build_dump) (struct gfs_sbd * sdp, struct gfs_trans * tr);
/*
* Operations that happen at recovery time
*/
/* Reset/init whatever before doing recovery */
void (*lo_before_scan) (struct gfs_sbd * sdp, unsigned int jid,
struct gfs_log_header * head,
unsigned int pass);
/* LE-specific recovery procedure */
int (*lo_scan_elements) (struct gfs_sbd * sdp,
struct gfs_jindex * jdesc,
struct gfs_glock * gl, uint64_t start,
struct gfs_log_descriptor * desc,
unsigned int pass);
/* Verify and report recovery results/statistics */
void (*lo_after_scan) (struct gfs_sbd * sdp, unsigned int jid,
unsigned int pass);
/*
* Type of element (glock/buf/unlinked/quota)
*/
char *lo_name;
};
/*
* Structure that gets added to struct gfs_trans->tr_elements. They
* make up the "stuff" in each transaction.
*/
struct gfs_log_element {
struct gfs_log_operations *le_ops; /* Vector of functions */
struct gfs_trans *le_trans; /* We're part of this transaction */
struct list_head le_list; /* Link to transaction's element list */
};
/*
* Meta-header cache structure.
* One for each metadata block that we've de-allocated.
* Used to temporarily store gfs_meta_header structs for meta blocks that
* have been freshly turned into FREEMETA (alloc'd or de-alloc'd). Storing
* these (small) structures in-core allows us to release the (large) buffers,
* and not need to re-read the header from disk if/when we re-allocate the
* blocks to USEDMETA, as long as this node holds the EXCLUSIVE lock for the
* resource group containing the blocks. If we release the EX lock, we must
* throw away the rgrp's cached meta headers, since another node could change
* the blocks' contents.
* In-core superblock structure hosts the hashed cache, as well as a
* linear list of all cached entries, in most-recently-added order.
* Also, each resource group keeps a list of cached blocks within its scope.
*/
struct gfs_meta_header_cache {
/* Links to various lists */
struct list_head mc_list_hash; /* Superblock's hashed list */
struct list_head mc_list_single; /* Superblock's list, MRU order */
struct list_head mc_list_rgd; /* Resource group's list */
uint64_t mc_block; /* Block # (in-place address) */
struct gfs_meta_header mc_mh; /* Payload: the block's meta-header */
};
/*
* Dependency cache structure.
* In-core superblock structure hosts the actual cache.
* Also, each resource group keeps a list of dependency blocks within its scope.
*/
struct gfs_depend {
/* Links to various lists */
struct list_head gd_list_hash; /* Superblock's hashed list */
struct list_head gd_list_rgd; /* Resource group's list */
struct gfs_rgrpd *gd_rgd; /* Resource group descriptor */
uint64_t gd_formal_ino; /* Inode ID */
unsigned long gd_time; /* Time (jiffies) when put on list */
};
/*
* Block allocation bitmap descriptor structure.
* One of these for each FS block that contains bitmap data
* (i.e. the resource group header blocks and their following bitmap blocks).
* Each allocatable FS data block is represented by 2 bits (4 alloc states).
*/
struct gfs_bitmap {
uint32_t bi_offset; /* Byte offset of bitmap within this bit block
(non-zero only for an rgrp header block) */
uint32_t bi_start; /* Data block (rgrp scope, 32-bit) represented
by the first bit-pair in this bit block */
uint32_t bi_len; /* The number of bitmap bytes in this bit block */
};
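/*
 * Illustrative sketch, not part of this patch: with two bits per block,
 * the allocation state of an rgrp-relative block can be read from a
 * bitmap buffer as below.  The helper name example_get_blkst is
 * hypothetical; the real bitmap accessors live elsewhere in this tree.
 */
static inline unsigned char example_get_blkst(const unsigned char *bits,
					      uint32_t blk)
{
	const unsigned char *byte = bits + (blk / 4);	/* 4 block states per byte */
	unsigned int shift = (blk % 4) * 2;		/* 2 bits per block */

	return (*byte >> shift) & 0x03;			/* one of 4 alloc states */
}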
+#define RD_FL_META2FREE (0x00000001) /* rgrpd has freeable metadata */
+
/*
* Resource Group (Rgrp) descriptor structure.
* There is one of these for each resource (block) group in the FS.
* The filesystem is divided into a number of resource groups to allow
* simultaneous block alloc operations by a number of nodes.
*/
struct gfs_rgrpd {
/* Links to superblock lists */
struct list_head rd_list; /* On-disk-order list of all rgrps */
struct list_head rd_list_mru; /* Most Recently Used list of all rgs */
struct list_head rd_recent; /* recently used rgrps */
uint32_t rd_try_counter; /* # of times we fail a try lock */
struct gfs_glock *rd_gl; /* Glock for this rgrp */
struct gfs_rindex rd_ri; /* Resource Index (on-disk) structure */
struct gfs_rgrp rd_rg; /* Resource Group (on-disk) structure */
uint64_t rd_rg_vn; /* Version #: if != glock's gl_vn,
we need to read rgrp fm disk */
/* Block alloc bitmap cache */
struct gfs_bitmap *rd_bits; /* Array of block bitmap descriptors */
struct buffer_head **rd_bh; /* Array of ptrs to block bitmap bh's */
/* Block allocation strategy, rgrp scope. Start at these blocks when
searching for next data/meta block to alloc */
uint32_t rd_last_alloc_data; /* Most recent data block allocated */
uint32_t rd_last_alloc_meta; /* Most recent meta block allocated */
struct list_head rd_mhc; /* Cached meta-headers for this rgrp */
struct list_head rd_depend; /* Dependent inodes (MRU order) */
struct gfs_sbd *rd_sbd; /* FS incore superblock (fs instance) */
+ uint32_t rd_flags; /* RD_FL_XXX : flags for this rgrpd */
};
/*
* Per-buffer data
* One of these is attached as GFS private data to each FS block's buffer_head.
* These keep track of a buffer's progress through the transaction pipeline,
* using the "new" embedded log element to attach it to a being-built
* transaction, and moving the attachment point to the "incore" LE once
* the transaction completes (at which time the buffer becomes a candidate
* to be written to the on-disk log).
* A buffer may be attached simultaneously to a new and an incore transaction,
* but no more than one of each: Only one new trans may be built at a time
* for a given buffer, obviously, since the buffer's contents are protected
* by an EXclusive glock when writing. And, when a transaction is completely
* built, GFS combines incore transactions that share glocks (see
* incore_commit()), i.e. the glock that protects the buffer, so a buffer
* never needs to be attached to more than one (combined) incore trans.
* Note that multiple transactions can modify the buffer since its most
* recent writes to disk. This principle applies to both in-place and
* journal block locations on-disk, allowing this node to keep modifying the
* cached data without writing it to disk, unless/until another node needs
* to access the data, or the Linux OS tells us to sync to disk.
* If a transaction follows another transaction before the first transaction's
* log completes (indicated by the in-place buffer head still being pinned
* in RAM), GFS copies the first transaction's results to a "frozen"
* image of the buffer, so the first transaction results (an atomic
* snapshot) can be logged properly, while the second transaction is
* modifying the "real" buffer. This frozen copy lives only until the new
* transaction is complete, at which point one of two things has occurred:
* 1). Buffer was logged successfully; frozen copy's job is done.
* 2). Buffer was not yet logged; frozen copy no longer needed, newer
* buffer becomes the log candidate.
*
* gfs_bufdata structs also link into the Active Items Lists (AIL) (buffers
* flushed to on-disk log, but not yet flushed to on-disk in-place locations)
* attached to:
* 1). The latest transaction to modify and log (on-disk) the buffer, and
* 2). The glock that protects the buffer's contents.
* The buffer is attached to only the most recent transaction's AIL
* list for a couple of reasons. One is that only the most up-to-date
* buffer content needs to be written to the in-place block on-disk. The
* other is that since there is a more recent copy of the block in
* the log, we don't need to keep the older copies in the log. We can
* remove them from the AIL and let the log space be reused for new
* transactions (GFS advances the log tail when removing buffers from AIL).
*/
struct gfs_bufdata {
struct buffer_head *bd_bh; /* We belong to this Linux buffer_head */
struct gfs_glock *bd_gl; /* This glock protects buffer's payload */
/* Log elements map us to a particular set of log operations functions,
and to a particular transaction */
struct gfs_log_element bd_new_le; /* New, incomplete transaction */
struct gfs_log_element bd_incore_le; /* Complete (committed) trans */
char *bd_frozen; /* "Frozen" copy of buffer's data */
struct semaphore bd_lock; /* Protects access to this structure */
/* "Pin" means keep buffer in RAM, don't write to disk (yet) */
unsigned int bd_pinned; /* Recursive pin count */
/* Links to Active Items Lists */
struct list_head bd_ail_tr_list; /* This buf's most recent trans' AIL */
struct list_head bd_ail_gl_list; /* This buf's glock's AIL */
};
/*
* Glock operations
* One set of operations for each glock, the set selected by type of glock.
* These functions get called at various points in a glock's lifetime.
* "xmote" = promote or demote (change lock state) a glock at inter-node scope.
* "th" = top half, "bh" = bottom half
* Some operations/fields are required (GFS assumes they are there):
* go_xmote_th
* go_drop_th
* go_type
* Other operations are optional (GFS checks for presence before calling).
*/
struct gfs_glock_operations {
/* Acquire lock or change lock state at inter-node scope:
Does type-specific preparation (if any)
Uses gfs_glock_xmote_th to call lock module. */
void (*go_xmote_th) (struct gfs_glock * gl, unsigned int state,
int flags);
/* After acquiring or changing a lock at inter-node scope */
void (*go_xmote_bh) (struct gfs_glock * gl);
/* Release (unlock) a lock at inter-node scope:
Does type-specific preparation (if any)
Uses gfs_glock_drop_th to call lock module. */
void (*go_drop_th) (struct gfs_glock * gl);
/* After releasing a lock at inter-node scope */
void (*go_drop_bh) (struct gfs_glock * gl);
/* Sync dirty data to disk (e.g. before demoting an EX inter-node lock)
(another node needs to read the updated data from disk) */
void (*go_sync) (struct gfs_glock * gl, int flags);
/* Invalidate local cached data just after releasing an inter-node lock
(another node may change the on-disk data, so it's no good to us) */
void (*go_inval) (struct gfs_glock * gl, int flags);
/* Lock-type-specific check to see if it's okay to unlock a glock
at inter-node scope (and remove it from our glock cache) */
int (*go_demote_ok) (struct gfs_glock * gl);
/* After getting lock for first holder (within this node) */
int (*go_lock) (struct gfs_glock * gl, int flags);
/* After last holder (within this node) gives up lock (glock may
remain in glock cache, though) */
void (*go_unlock) (struct gfs_glock * gl, int flags);
/* After receiving a callback: another node needs the lock */
void (*go_callback) (struct gfs_glock * gl, unsigned int state);
/* Called when the glock layer marks a lock as being not greedy
anymore */
void (*go_greedy) (struct gfs_glock * gl);
/* Lock type: locks with same lock # (often an FS block #),
but different types, are different locks */
int go_type;
};
/*
* Glock holder structure
* One for each holder of a glock.
* These coordinate the use, within this node, of an acquired inter-node glock.
* Once a node has acquired a glock, it may be shared within that node by
* several processes, or even by several recursive requests from the same
* process. Each is a separate "holder". Different holders may co-exist
* having requested different lock states, as long as the node holds the
* glock in a state that is compatible. A hold requestor may select, via
* flags, the rules by which sharing within the node is granted:
* LM_FLAG_ANY: Grant if glock state is any other than UNLOCKED.
* GL_EXACT: Grant only if glock state is exactly the requested state.
* GL_LOCAL_EXCL: Grant only one holder at a time within this node.
* With no flags, a hold will be granted to a SHARED request even if the
* node holds the glock in EXCLUSIVE mode. See relaxed_state_ok().
* When a process needs to manipulate a lock, it requests it via one of
* these holder structures. If the request cannot be satisfied immediately,
* the holder structure gets queued on one of these lists in gfs_glock:
* 1) waiters1, for gaining exclusive access to the (local) glock structure.
* 2) waiters2, for demoting a lock (unlocking a glock, or changing its state
* to be less restrictive) or relinquishing "greedy" status.
* 3) waiters3, for promoting (locking a new glock, or changing a glock state
* to be more restrictive).
* When holding a lock, gfs_holder struct stays on glock's holder list.
* See gfs-kernel/src/harness/lm_interface.h for gh_state (LM_ST_...)
* and gh_flags (LM_FLAG...) fields.
* Also see glock.h for gh_flags field (GL_...) flags.
*/
/* Action requests */
#define HIF_MUTEX (0) /* Exclusive (local) access to glock struct */
#define HIF_PROMOTE (1) /* Change lock to more restrictive state */
#define HIF_DEMOTE (2) /* Change lock to less restrictive state */
#define HIF_GREEDY (3) /* Wait for the glock to be unlocked */
/* States */
#define HIF_ALLOCED (4) /* Holder structure is or was in use */
#define HIF_DEALLOC (5) /* Toss holder struct as soon as queued request
* is satisfied */
#define HIF_HOLDER (6) /* We have been granted a hold on the lock */
#define HIF_FIRST (7) /* We are first holder to get the lock */
#define HIF_RECURSE (8) /* >1 hold requests on same glock by same process*/
#define HIF_ABORTED (9) /* Aborted before being submitted */
struct gfs_holder {
struct list_head gh_list; /* Link to one of glock's holder lists */
struct gfs_glock *gh_gl; /* Glock that we're holding */
struct task_struct *gh_owner; /* Linux process that is the holder */
/* request to change lock state */
unsigned int gh_state; /* LM_ST_... requested lock state */
int gh_flags; /* GL_... or LM_FLAG_... req modifiers */
int gh_error; /* GLR_... CANCELLED/TRYFAILED/-errno */
unsigned long gh_iflags; /* HIF_... holder state, see above */
struct completion gh_wait; /* Wait for completion of ... */
};
/*
* Glock Structure
* One for each inter-node lock held by this node.
* A glock is a local representation/abstraction of an inter-node lock.
* Inter-node locks are managed by a "lock module" (LM) which plugs in to
* the lock harness / glock interface (see gfs-kernel/harness). Different
* lock modules support different lock protocols (e.g. GULM, GDLM, no_lock).
* A glock may have one or more holders within a node. See gfs_holder above.
* Glocks are managed within a hash table hosted by the in-core superblock.
* After all holders have released a glock, it will stay in the hash table
* cache for a time (depending on lock type), during which the inter-node
* lock will not be released unless another node needs the lock (lock
* manager requests this via callback to GFS through LM on this node). This
* provides better performance in case this node needs the glock again soon.
* See comments for meta_go_demote_ok(), glops.c.
* Each glock has an associated vector of lock-type-specific "glops" functions
* which are called at important times during the life of a glock, and
* which define the type of lock (e.g. dinode, rgrp, non-disk, etc).
* See gfs_glock_operations above.
* A glock, at inter-node scope, is identified by the following dimensions:
* 1) lock number (usually a block # for on-disk protected entities,
* or a fixed assigned number for non-disk locks, e.g. MOUNT).
* 2) lock type (actually, the type of entity protected by the lock).
* 3) lock namespace, to support multiple GFS filesystems simultaneously.
* Namespace (usually cluster:filesystem) is specified when mounting.
* See man page for gfs_mount.
* Glocks require support of Lock Value Blocks (LVBs) by the inter-node lock
* manager. LVBs are small (32-byte) chunks of data associated with a given
* lock, that can be quickly shared between cluster nodes. Used for certain
* purposes such as sharing an rgroup's block usage statistics without
* requiring the overhead of:
* -- sync-to-disk by one node, then a
* -- read from disk by another node.
*
*/
#define GLF_PLUG (0) /* Dummy */
#define GLF_LOCK (1) /* Exclusive (local) access to glock
* structure */
#define GLF_STICKY (2) /* Don't release this inter-node lock
* unless another node explicitly asks */
#define GLF_PREFETCH (3) /* This lock has been (speculatively)
* prefetched, demote if not used soon */
#define GLF_SYNC (4) /* Sync lock's protected data as soon as
* there are no more holders */
#define GLF_DIRTY (5) /* There is dirty data for this lock,
* sync before releasing inter-node */
#define GLF_SKIP_WAITERS2 (6) /* Make run_queue() ignore gl_waiters2
* (demote/greedy) holders */
#define GLF_GREEDY (7) /* This lock is ignoring callbacks
* (requests from other nodes) for now */
struct gfs_glock {
struct list_head gl_list; /* Link to hb_list in one of superblock's
* sd_gl_hash glock hash table buckets */
unsigned long gl_flags; /* GLF_... see above */
struct lm_lockname gl_name; /* Lock number and lock type */
atomic_t gl_count; /* Usage count */
spinlock_t gl_spin; /* Protects some members of this struct */
/* Lock state reflects inter-node manager's lock state */
unsigned int gl_state; /* LM_ST_... see harness/lm_interface.h */
/* Lists of gfs_holders */
struct list_head gl_holders; /* all current holders of the glock */
struct list_head gl_waiters1; /* HIF_MUTEX */
struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
struct list_head gl_waiters3; /* HIF_PROMOTE */
struct gfs_glock_operations *gl_ops; /* function vector, defines type */
/* State to remember for async lock requests */
struct gfs_holder *gl_req_gh; /* Holder for request being serviced */
gfs_glop_bh_t gl_req_bh; /* The bottom half to execute */
void *gl_lock; /* Lock module's private lock data */
char *gl_lvb; /* Lock Value Block */
atomic_t gl_lvb_count; /* LVB recursive usage (hold/unhold) count */
uint64_t gl_vn; /* Incremented when protected data changes */
unsigned long gl_stamp; /* Glock cache retention timer */
void *gl_object; /* The protected entity (e.g. a dinode) */
/* Incore transaction stuff */
/* Log elements map us to a particular set of log operations functions,
and to a particular transaction */
struct gfs_log_element gl_new_le; /* New, incomplete transaction */
struct gfs_log_element gl_incore_le; /* Complete (committed) trans */
struct gfs_gl_hash_bucket *gl_bucket; /* Our bucket in sd_gl_hash */
struct list_head gl_reclaim; /* Link to sd_reclaim_list */
struct gfs_sbd *gl_sbd; /* Superblock (FS instance) */
struct inode *gl_aspace; /* The buffers protected by this lock */
struct list_head gl_ail_bufs; /* AIL buffers protected by us */
};
/*
* In-Place Reservation structure
* Coordinates allocation of "in-place" (as opposed to journal) FS blocks,
* which contain persistent inode/file/directory data and metadata.
* These blocks are the allocatable blocks within resource groups (i.e.
* not including rgrp header and block alloc bitmap blocks).
* gfs_inplace_reserve() calculates a fulfillment plan for allocating blocks,
* based on block statistics in the resource group headers.
* Then, gfs_blkalloc() or gfs_metaalloc() walks the block alloc bitmaps
* to do the actual allocation.
*/
struct gfs_alloc {
/* Up to 4 quotas (including an inode's user and group quotas)
can track changes in block allocation */
unsigned int al_qd_num; /* # of quotas tracking changes */
struct gfs_quota_data *al_qd[4]; /* Ptrs to quota structures */
struct gfs_holder al_qd_ghs[4]; /* Holders for quota glocks */
/* Request, filled in by the caller to gfs_inplace_reserve() */
uint32_t al_requested_di; /* Number of dinodes to reserve */
uint32_t al_requested_meta; /* Number of metadata blocks to reserve */
uint32_t al_requested_data; /* Number of data blocks to reserve */
/* Fulfillment plan, filled in by gfs_inplace_reserve() */
char *al_file; /* Debug info, .c file making request */
unsigned int al_line; /* Debug info, line of code making req */
struct gfs_holder al_ri_gh; /* Glock holder for resource grp index */
struct gfs_holder al_rgd_gh; /* Glock holder for al_rgd rgrp */
struct gfs_rgrpd *al_rgd; /* Resource group from which to alloc */
uint32_t al_reserved_meta; /* Alloc up to this # meta blocks from al_rgd */
uint32_t al_reserved_data; /* Alloc up to this # data blocks from al_rgd */
/* Actual alloc, filled in by gfs_blkalloc()/gfs_metaalloc(), etc. */
uint32_t al_alloced_di; /* # dinode blocks allocated */
uint32_t al_alloced_meta; /* # meta blocks allocated */
uint32_t al_alloced_data; /* # data blocks allocated */
/* Dinode allocation crap */
struct gfs_unlinked *al_ul; /* Unlinked dinode log entry */
};
/*
* Incore inode structure
*/
#define GIF_QD_LOCKED (0)
#define GIF_PAGED (1)
#define GIF_SW_PAGED (2)
struct gfs_inode {
struct gfs_inum i_num; /* Formal inode # and block address */
atomic_t i_count; /* Usage count */
unsigned long i_flags; /* GIF_... see above */
uint64_t i_vn; /* Version #: if different from glock's vn,
we need to read inode from disk */
struct gfs_dinode i_di; /* Dinode (on-disk) structure */
struct gfs_glock *i_gl; /* This glock protects this inode */
struct gfs_sbd *i_sbd; /* Superblock (fs instance structure) */
struct inode *i_vnode; /* Linux VFS inode structure */
struct gfs_holder i_iopen_gh; /* Glock holder for Inode Open lock */
/* Block allocation strategy, inode scope */
struct gfs_alloc *i_alloc; /* In-place block reservation structure */
uint64_t i_last_rg_alloc; /* Most recent blk alloc was fm this rgrp */
spinlock_t i_spin;
struct rw_semaphore i_rw_mutex;
/* Cache of most-recently used buffers in indirect addressing chain */
struct buffer_head *i_cache[GFS_MAX_META_HEIGHT];
unsigned int i_greedy; /* The amount of time to be greedy */
unsigned long i_last_pfault; /* The time of the last page fault */
struct address_space_operations gfs_file_aops;
};
/*
* GFS per-fd structure
*/
#define GFF_DID_DIRECT_ALLOC (0)
struct gfs_file {
unsigned long f_flags; /* GFF_... see above */
struct semaphore f_fl_lock; /* Lock to protect flock operations */
struct gfs_holder f_fl_gh; /* Holder for this f_vfile's flock */
struct gfs_inode *f_inode; /* Incore GFS inode */
struct file *f_vfile; /* Linux file struct */
};
/*
* Unlinked inode log entry incore structure
*/
#define ULF_NEW_UL (0) /* Part of new (being built) trans */
#define ULF_INCORE_UL (1) /* Part of incore-committed trans */
#define ULF_IC_LIST (2)
#define ULF_OD_LIST (3)
#define ULF_LOCK (4) /* Protects access to this structure */
struct gfs_unlinked {
struct list_head ul_list; /* Link to superblock's sd_unlinked_list */
unsigned int ul_count; /* Usage count */
struct gfs_inum ul_inum; /* Formal inode #, block addr */
unsigned long ul_flags; /* ULF_... */
/* Log elements map us to a particular set of log operations functions,
and to a particular transaction */
struct gfs_log_element ul_new_le; /* New, not yet committed */
struct gfs_log_element ul_incore_le; /* Committed to incore log */
struct gfs_log_element ul_ondisk_le; /* Committed to ondisk log */
};
/*
* Quota log element
* One for each logged change in a block alloc value affecting a given quota.
* Only one of these for a given quota within a given transaction;
* multiple changes, within one transaction, for a given quota will be
* combined into one log element.
*/
struct gfs_quota_le {
/* Log element maps us to a particular set of log operations functions,
and to a particular transaction */
struct gfs_log_element ql_le; /* Generic log element structure */
struct gfs_quota_data *ql_data; /* The quota we're changing */
struct list_head ql_data_list; /* Link to quota's log element list */
int64_t ql_change; /* # of blocks alloc'd (+) or freed (-) */
};
/*
* Quota structure
* One for each user or group quota.
* Summarizes all block allocation activity for a given quota, and supports
* recording updates of current block alloc values in GFS' special quota
* file, including the journaling of these updates, encompassing
* multiple transactions and log dumps.
*/
#define QDF_USER (0) /* User (1) vs. group (0) quota */
#define QDF_OD_LIST (1) /* Waiting for sync to quota file */
#define QDF_LOCK (2) /* Protects access to this structure */
struct gfs_quota_data {
struct list_head qd_list; /* Link to superblock's sd_quota_list */
unsigned int qd_count; /* Usage count */
uint32_t qd_id; /* User or group ID number */
unsigned long qd_flags; /* QDF_... */
/* This list is for non-log-dump transactions */
struct list_head qd_le_list; /* List of gfs_quota_le log elements */
/* Summary of block alloc changes affecting this quota, in various
stages of logging & syncing changes to the special quota file */
int64_t qd_change_new; /* New, not yet committed to in-core log*/
int64_t qd_change_ic; /* Committed to in-core log */
int64_t qd_change_od; /* Committed to on-disk log */
int64_t qd_change_sync; /* Being synced to the in-place quota file */
struct gfs_quota_le qd_ondisk_ql; /* Log element for log dump */
uint64_t qd_sync_gen; /* Sync-to-quota-file generation # */
/* Glock provides protection for quota, *and* provides
lock value block (LVB) communication, between nodes, of current
quota values. Shared lock -> LVB read. EX lock -> LVB write. */
struct gfs_glock *qd_gl; /* glock for this quota */
struct gfs_quota_lvb qd_qb; /* LVB (limit/warn/value) */
unsigned long qd_last_warn; /* Jiffies of last warning to user */
};
/*
* Log Buffer descriptor structure.
* One for each block buffer recorded in the log.
* When beginning a new transaction, GFS pre-allocates a number of these,
* and puts them on transaction's tr_free_bufs list.
* Logged buffers are of two types:
* 1). Exact copies of buffers to be written to in-place location in FS.
* 2). Log-only buffers such as log headers and control blocks (e.g. tags).
* A gfs_log_buf is required for both types; the ones for log-only buffers
* contain NULL in lb_unlock, and get cleaned up after the log write.
* lb_bh is a "fake" buffer head that directs Linux block I/O to write the buf
* to the on-disk log location, rather than the on-disk in-place location.
* Used for both types.
* lb_unlock points to the "real" buffer head that directs Linux to write the
* buf to its regular on-disk in-place filesystem location. Once the commit
* to the on-disk log is finished, GFS unlocks the "real" buffer so it can be
* written to in-place block, or modified by another transaction.
* Used only for type 1).
*/
struct gfs_log_buf {
/* Link to one of the transaction structure's lists */
struct list_head lb_list; /* Link to tr_free_bufs or tr_list */
struct buffer_head lb_bh; /* "Fake" bh; for the log block */
struct buffer_head *lb_unlock; /* "Real" bh; for the in-place block */
};
/*
* Transaction structure
* One for each transaction
* This coordinates the logging and flushing of written metadata.
*/
#define TRF_LOG_DUMP (0x00000001)
#define TRF_DUMMY (0x00000002)
struct gfs_trans {
/* Link to various lists */
struct list_head tr_list; /* Superblk's incore trans or AIL list*/
/* Initial creation stuff */
char *tr_file; /* Debug info: .c file creating trans */
unsigned int tr_line; /* Debug info: codeline creating trans */
/* Reservations for on-disk space in journal.
Meta blocks are copies of in-place filesystem blocks.
Extra blocks are log-only (log header and control blocks) */
unsigned int tr_mblks_asked; /* # of meta log blocks requested */
unsigned int tr_eblks_asked; /* # of extra log blocks requested */
unsigned int tr_seg_reserved; /* # of segments actually reserved */
struct gfs_holder *tr_t_gh; /* Glock holder for this transaction */
/* Stuff filled in during creation */
unsigned int tr_flags; /* TRF_... */
struct list_head tr_elements; /* List of this trans' log elements */
/* Stuff modified during the commit */
/* When creating a new transaction, GFS pre-allocates as many of
these buffers and descriptor structures as it might need for
all loggable filesystem (meta)data, and log-control (log-only, not
going to filesystem in-place location) data going to on-disk log.
It keeps them on these "free" lists until they get used (and linked
into tr_bufs list, below) or "refunded" if not needed. */
unsigned int tr_num_free_bufs; /* List of free gfs_log_buf structs */
struct list_head tr_free_bufs; /* .. 1 for each log block */
unsigned int tr_num_free_bmem; /* List of free fs-block-size buffers */
struct list_head tr_free_bmem; /* .. for log-only (e.g. tag) blocks */
/* Logged transaction starts with a (first) log header at a segment
boundary, and fills contiguous blocks after that. Each segment
boundary block gets another log header. */
uint64_t tr_log_head; /* The next log block # to fill */
uint64_t tr_first_head; /* Trans' first log header's block # */
/* gfs_log_buf structs move from tr_free_bufs to here when being used */
struct list_head tr_bufs; /* List of buffers going to the log */
/* Stuff that's part of the Active Items List (AIL) */
struct list_head tr_ail_bufs; /* List of buffers on AIL list */
/* # log elements of various types on tr_elements list */
unsigned int tr_num_gl; /* Glocks */
unsigned int tr_num_buf; /* Buffers */
unsigned int tr_num_iul; /* Unlinked inodes */
unsigned int tr_num_ida; /* De-allocated inodes */
unsigned int tr_num_q; /* Quotas */
};
#define GFS_GLOCKD_DEFAULT (1)
#define GFS_GLOCKD_MAX (32)
#define GFS_QUOTA_DEFAULT GFS_QUOTA_OFF
#define GFS_QUOTA_OFF 0
#define GFS_QUOTA_ACCOUNT 1
#define GFS_QUOTA_ON 2
#define GFS_DATA_DEFAULT GFS_DATA_ORDERED
#define GFS_DATA_WRITEBACK 1
#define GFS_DATA_ORDERED 2
struct gfs_args {
char ar_lockproto[GFS_LOCKNAME_LEN]; /* The name of the Lock Protocol */
char ar_locktable[GFS_LOCKNAME_LEN]; /* The name of the Lock Table */
char ar_hostdata[GFS_LOCKNAME_LEN]; /* The host specific data */
int ar_spectator; /* Don't get a journal because we're always RO. */
/*
* GFS can invoke some flock and disk caching optimizations if it is
* not in a cluster, i.e. is a local filesystem. The chosen lock
* module tells GFS, at mount time, if it supports clustering.
* The nolock module is the only one that does not support clustering;
* it sets to TRUE the local_fs field in the struct lm_lockops.
* GFS can either optimize, or ignore the opportunity.
* The user controls behavior via the following mount options.
*/
int ar_ignore_local_fs; /* Don't optimize even if local_fs is TRUE */
int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
int ar_localcaching; /* Local-style caching (dangerous on multihost) */
int ar_oopses_ok; /* Allow oopses */
int ar_debug; /* Oops on errors instead of trying to be graceful */
int ar_upgrade; /* Upgrade ondisk/multihost format */
unsigned int ar_num_glockd; /* # of glock cleanup daemons to run
(more daemons => faster cleanup) */
int ar_posix_acls; /* Enable posix acls */
int ar_suiddir; /* suiddir support */
int ar_noquota; /* Turn off quota support */
};
struct gfs_tune {
spinlock_t gt_spin;
unsigned int gt_ilimit1;
unsigned int gt_ilimit1_tries;
unsigned int gt_ilimit1_min;
unsigned int gt_ilimit2;
unsigned int gt_ilimit2_tries;
unsigned int gt_ilimit2_min;
unsigned int gt_demote_secs; /* Cache retention for unheld glock */
unsigned int gt_incore_log_blocks;
unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
unsigned int gt_depend_secs;
/* How often various daemons run (seconds) */
unsigned int gt_scand_secs; /* Find unused glocks and inodes */
unsigned int gt_recoverd_secs; /* Recover journal of crashed node */
unsigned int gt_logd_secs; /* Update log tail as AIL flushes */
unsigned int gt_quotad_secs; /* Sync changes to quota file, clean*/
unsigned int gt_inoded_secs; /* Toss unused inodes */
unsigned int gt_glock_purge; /* Purge glock */
unsigned int gt_quota_simul_sync; /* Max # quotavals to sync at once */
unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
unsigned int gt_atime_quantum; /* Min secs between atime updates */
unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
unsigned int gt_quota_scale_num; /* Numerator */
unsigned int gt_quota_scale_den; /* Denominator */
unsigned int gt_quota_enforce;
unsigned int gt_quota_account;
unsigned int gt_new_files_jdata;
unsigned int gt_new_files_directio;
unsigned int gt_max_atomic_write; /* Split large writes into this size*/
unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
unsigned int gt_lockdump_size;
unsigned int gt_stall_secs; /* Detects trouble! */
unsigned int gt_complain_secs;
unsigned int gt_reclaim_limit; /* Max # glocks in reclaim list */
unsigned int gt_entries_per_readdir;
unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
unsigned int gt_statfs_slots;
unsigned int gt_max_mhc; /* Max # of meta headers in mhc cache */
unsigned int gt_greedy_default;
unsigned int gt_greedy_quantum;
unsigned int gt_greedy_max;
unsigned int gt_rgrp_try_threshold;
unsigned int gt_statfs_fast;
+ unsigned int gt_max_rgrp_free_mdata; /* Max # of rgrps to free metadata from */
};
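/*
 * Illustrative sketch, not part of this patch: gfs_tune_get(), used by the
 * daemons in daemon.c and by gfs_inoded() to read gt_max_rgrp_free_mdata,
 * is assumed to fetch a single tunable under gt_spin so a concurrent
 * update is seen atomically.  The helper below is hypothetical.
 */
static inline unsigned int example_tune_get(struct gfs_tune *gt,
					    unsigned int *field)
{
	unsigned int value;

	spin_lock(&gt->gt_spin);
	value = *field;
	spin_unlock(&gt->gt_spin);
	return value;
}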
/*
* One bucket of the filesystem's sd_gl_hash glock hash table.
*
* A gfs_glock links into a bucket's list via glock's gl_list member.
*
*/
struct gfs_gl_hash_bucket {
rwlock_t hb_lock; /* Protects list */
struct list_head hb_list; /* List of glocks in this bucket */
};
/*
* "Super Block" Data Structure
* One per mounted filesystem.
* This is the big instance structure that ties everything together for
* a given mounted filesystem. Each GFS mount has its own, supporting
* mounts of multiple GFS filesystems on each node.
* Pointer to this is usually seen as "sdp" throughout code.
* This is a very large structure, as structures go, in part because it
* contains arrays of hash buckets for various in-core caches.
*/
#define SDF_JOURNAL_LIVE (0) /* Journaling is active (journal is writeable)*/
#define SDF_SHUTDOWN (1) /* FS abnormally shut down */
/* (Re)mount options from Linux VFS */
#define SDF_NOATIME (8) /* Don't change access time */
#define SDF_ROFS (9) /* Read-only mode */
/* Journal log dump support */
#define SDF_NEED_LOG_DUMP (10) /* Need to rewrite unlink and quota tags */
#define SDF_FOUND_UL_DUMP (11) /* Recovery found unlinked tags */
#define SDF_FOUND_Q_DUMP (12) /* Recovery found quota tags */
#define SDF_IN_LOG_DUMP (13) /* Serializes log dumps */
/* Glock cache */
#define GFS_GL_HASH_SHIFT (13) /* # hash buckets = 8K */
#define GFS_GL_HASH_SIZE (1 << GFS_GL_HASH_SHIFT)
#define GFS_GL_HASH_MASK (GFS_GL_HASH_SIZE - 1)
/* Meta header cache */
#define GFS_MHC_HASH_SHIFT (10) /* # hash buckets = 1K */
#define GFS_MHC_HASH_SIZE (1 << GFS_MHC_HASH_SHIFT)
#define GFS_MHC_HASH_MASK (GFS_MHC_HASH_SIZE - 1)
/* Dependency cache */
#define GFS_DEPEND_HASH_SHIFT (10) /* # hash buckets = 1K */
#define GFS_DEPEND_HASH_SIZE (1 << GFS_DEPEND_HASH_SHIFT)
#define GFS_DEPEND_HASH_MASK (GFS_DEPEND_HASH_SIZE - 1)
struct gfs_sbd {
struct gfs_sb sd_sb; /* GFS on-disk Super Block image */
struct super_block *sd_vfs; /* Linux VFS device independent sb */
struct gfs_args sd_args; /* Mount arguments */
unsigned long sd_flags; /* SDF_... see above */
struct gfs_tune sd_tune; /* Filesystem tuning structure */
/* statfs */
struct inode *sd_statfs_inode;
spinlock_t sd_statfs_spin;
struct gfs_statfs_change_host sd_statfs_master;
struct gfs_statfs_change_host sd_statfs_local;
unsigned long sd_statfs_sync_time;
/* Resource group stuff */
struct gfs_inode *sd_riinode; /* Resource Index (rindex) inode */
uint64_t sd_riinode_vn; /* Resource Index version # (detects
whether new rgrps have been added) */
struct list_head sd_rglist; /* List of all resource groups,
on-disk order */
struct semaphore sd_rindex_lock;/* Serializes RIndex rereads */
struct list_head sd_rg_mru_list;/* List of all resource groups,
most-recently-used (MRU) order */
spinlock_t sd_rg_mru_lock; /* Protect mru list */
struct list_head sd_rg_recent; /* List of rgrps from which blocks
were recently allocated */
spinlock_t sd_rg_recent_lock; /* Protect recent list */
struct gfs_rgrpd *sd_rg_forward;/* Next rgrp from which to attempt
a block alloc */
spinlock_t sd_rg_forward_lock; /* Protect forward pointer */
unsigned int sd_rgcount; /* Total # of resource groups */
/* Constants computed on mount */
/* "bb" == "basic block" == 512Byte sector */
uint32_t sd_fsb2bb; /* # 512B basic blocks in a FS block */
uint32_t sd_fsb2bb_shift; /* Shift sector # to the right by
this to get FileSystem block addr */
uint32_t sd_diptrs; /* Max # of block pointers in a dinode */
uint32_t sd_inptrs; /* Max # of block pointers in an indirect blk */
uint32_t sd_jbsize; /* Payload size (bytes) of a journaled metadata
block (GFS journals all meta blocks) */
uint32_t sd_hash_bsize; /* sizeof(exhash hash block) */
uint32_t sd_hash_bsize_shift;
uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */
uint32_t sd_max_height; /* Max height of a file's tree */
uint64_t sd_heightsize[GFS_MAX_META_HEIGHT];
uint32_t sd_max_jheight; /* Max height, journaled file's tree */
uint64_t sd_jheightsize[GFS_MAX_META_HEIGHT];
/* Lock Stuff */
/* Glock cache (all glocks currently held by this node for this FS) */
struct gfs_gl_hash_bucket sd_gl_hash[GFS_GL_HASH_SIZE];
/* Glock reclaim support for scand and glockd */
struct list_head sd_reclaim_list; /* List of glocks to reclaim */
spinlock_t sd_reclaim_lock;
wait_queue_head_t sd_reclaim_wchan;
atomic_t sd_reclaim_count; /* # glocks on reclaim list */
/* Lock module tells us if we're first-to-mount,
which journal to use, etc. */
struct lm_lockstruct sd_lockstruct; /* Info provided by lock module */
/* Other caches */
/* Meta-header cache (incore copies of on-disk meta headers) */
struct list_head sd_mhc[GFS_MHC_HASH_SIZE]; /* hash buckets */
struct list_head sd_mhc_single; /* Non-hashed list of all MHCs */
spinlock_t sd_mhc_lock;
atomic_t sd_mhc_count; /* # MHCs in cache */
/* Dependency cache */
struct list_head sd_depend[GFS_DEPEND_HASH_SIZE]; /* Hash buckets */
spinlock_t sd_depend_lock;
atomic_t sd_depend_count; /* # dependencies in cache */
/* LIVE inter-node lock indicates that FS is mounted on at least
one node */
struct gfs_holder sd_live_gh; /* Glock holder for LIVE lock */
/* For quiescing the filesystem */
struct gfs_holder sd_freeze_gh;
struct semaphore sd_freeze_lock;
unsigned int sd_freeze_count;
/* Inode Stuff */
struct gfs_inode *sd_rooti; /* FS's root inode */
/* Only 1 node at a time may rename (e.g. mv) a directory from
one directory to another. */
struct gfs_glock *sd_rename_gl; /* Rename glock */
/* Daemon stuff */
/* Scan for glocks and inodes to toss from memory */
struct task_struct *sd_scand_process; /* Scand places on reclaim list*/
struct task_struct *sd_glockd_process[GFS_GLOCKD_MAX];
unsigned int sd_glockd_num; /* # of glockd procs to do reclaiming*/
/* Recover journal of a crashed node */
struct task_struct *sd_recoverd_process;
/* Update log tail as AIL gets flushed to in-place on-disk blocks */
struct task_struct *sd_logd_process;
/* Sync quota updates to disk, and clean up unused quota structs */
struct task_struct *sd_quotad_process;
/* Clean up unused inode structures */
struct task_struct *sd_inoded_process;
/* Log stuff */
/* Transaction lock protects the following from one another:
normal write transaction, journal replay (recovery), fs upgrade,
fs read-only => read/write and read/write => read-only conversions.
Also, acquiring the transaction lock in a state other than shared
causes all other machines in the cluster to sync out their dirty
data, mark their journal as being clean, and prevent any new FS
modifications from occurring (i.e. quiesces the FS). */
struct gfs_glock *sd_trans_gl; /* Transaction glock structure */
struct gfs_inode *sd_jiinode; /* Journal index inode */
uint64_t sd_jiinode_vn; /* Journal index version # (detects
if new journals have been added) */
unsigned int sd_journals; /* Number of journals in the FS */
struct gfs_jindex *sd_jindex; /* Array of journal descriptors */
struct semaphore sd_jindex_lock;
unsigned long sd_jindex_refresh_time; /* Poll for new journals (secs) */
struct gfs_jindex sd_jdesc; /* This machine's journal descriptor */
struct gfs_holder sd_journal_gh; /* This machine's jrnl glock holder */
uint64_t sd_sequence; /* Assigned to xactions in order they commit */
uint64_t sd_log_head; /* Block number of next journal write */
uint64_t sd_log_wrap;
spinlock_t sd_log_seg_lock;
unsigned int sd_log_seg_free; /* # of free segments in the log */
unsigned int sd_log_seg_ail2; /* # of freeable segments in the log */
struct list_head sd_log_seg_list;
wait_queue_head_t sd_log_seg_wait;
/* "Active Items List" of transactions that have been flushed to
on-disk log, and are waiting for flush to in-place on-disk blocks */
struct list_head sd_log_ail; /* "next" is head, "prev" is tail */
/* Transactions committed incore, but not yet flushed to on-disk log */
struct list_head sd_log_incore; /* "Next" is newest, "prev" is oldest */
unsigned int sd_log_buffers; /* # of buffers in the incore log */
struct rw_semaphore sd_log_lock; /* Lock for access to log values */
uint64_t sd_log_dump_last;
uint64_t sd_log_dump_last_wrap;
/*
* Unlinked inode crap.
* List includes newly created, not-yet-linked inodes,
* as well as inodes that have been unlinked and are waiting
* to be de-allocated.
*/
struct list_head sd_unlinked_list; /* List of unlinked inodes */
spinlock_t sd_unlinked_lock; /* Protects list and members */
atomic_t sd_unlinked_ic_count;
atomic_t sd_unlinked_od_count;
/* Quota crap */
struct list_head sd_quota_list; /* List of all gfs_quota_data structs */
spinlock_t sd_quota_lock;
atomic_t sd_quota_count; /* # quotas on sd_quota_list */
atomic_t sd_quota_od_count; /* # quotas waiting for sync to
special on-disk quota file */
struct gfs_inode *sd_qinode; /* Special on-disk quota file */
uint64_t sd_quota_sync_gen; /* Generation, incr when sync to file */
unsigned long sd_quota_sync_time; /* Jiffies, last sync to quota file */
/* License crap */
struct gfs_inode *sd_linode; /* Special on-disk license file */
/* Recovery stuff */
/* Lock module tells GFS, via callback, when a journal needs recovery.
It stays on this list until recovery daemon performs recovery. */
struct list_head sd_dirty_j; /* List of dirty journals */
spinlock_t sd_dirty_j_lock; /* Protects list */
/* Statistics for 3 possible recovery actions for each buffer in log,
determined by comparing generation #s of logged block and
in-place block. Scope of stats is for one journal. */
unsigned int sd_recovery_replays; /* newer than in-place; copy it */
unsigned int sd_recovery_skips; /* older than in-place; ignore it */
unsigned int sd_recovery_sames; /* same as in-place; ignore it */
/* Counters */
/* current quantities of various things */
atomic_t sd_glock_count; /* # of gfs_glock structs alloc'd */
atomic_t sd_glock_held_count; /* # of glocks locked by this node */
atomic_t sd_inode_count; /* # of gfs_inode structs alloc'd */
atomic_t sd_bufdata_count; /* # of gfs_bufdata structs alloc'd */
atomic_t sd_fh2dentry_misses; /* total # get_dentry misses */
atomic_t sd_reclaimed; /* total # glocks reclaimed since mount */
/* total lock-related calls handled since mount */
atomic_t sd_glock_nq_calls;
atomic_t sd_glock_dq_calls;
atomic_t sd_glock_prefetch_calls;
atomic_t sd_lm_lock_calls;
atomic_t sd_lm_unlock_calls;
atomic_t sd_lm_callbacks;
atomic_t sd_lm_outstanding;
atomic_t sd_bio_reads;
atomic_t sd_bio_writes;
atomic_t sd_bio_outstanding;
/* total calls from Linux VFS handled since mount */
atomic_t sd_ops_address;
atomic_t sd_ops_dentry;
atomic_t sd_ops_export;
atomic_t sd_ops_file;
atomic_t sd_ops_inode;
atomic_t sd_ops_super;
atomic_t sd_ops_vm;
char sd_fsname[256];
char sd_table_name[256];
char sd_proto_name[256];
struct kobject sd_kobj;
/* Debugging crud */
unsigned long sd_last_warning;
spinlock_t sd_ail_lock;
struct list_head sd_recovery_bufs;
struct list_head sd_list;
};
#endif /* __INCORE_DOT_H__ */
diff --git a/gfs-kernel/src/gfs/ioctl.c b/gfs-kernel/src/gfs/ioctl.c
index d5489b532..e01211903 100644
--- a/gfs-kernel/src/gfs/ioctl.c
+++ b/gfs-kernel/src/gfs/ioctl.c
@@ -1,1605 +1,1609 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/semaphore.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <asm/uaccess.h>
#include <linux/compat.h>
#include "gfs_ioctl.h"
#include "gfs.h"
#include "bmap.h"
#include "dio.h"
#include "dir.h"
#include "eattr.h"
#include "file.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
#include "ioctl.h"
#include "log.h"
#include "quota.h"
#include "rgrp.h"
#include "super.h"
#include "trans.h"
typedef int (*gi_filler_t) (struct gfs_inode *ip,
struct gfs_ioctl *gi,
char *buf,
unsigned int size,
unsigned int *count);
#define ARG_SIZE (32)
/**
* gi_skeleton - Setup a buffer that functions can print into
* @ip:
* @gi:
* @filler:
*
* Returns: -errno or count of bytes copied to userspace
*/
static int
gi_skeleton(struct gfs_inode *ip, struct gfs_ioctl *gi,
gi_filler_t filler)
{
unsigned int size = gfs_tune_get(ip->i_sbd, gt_lockdump_size);
char *buf;
unsigned int count = 0;
int error;
if (size > gi->gi_size)
size = gi->gi_size;
buf = kmalloc(size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
error = filler(ip, gi, buf, size, &count);
if (error)
goto out;
if (copy_to_user(gi->gi_data, buf, count + 1))
error = -EFAULT;
else
error = count + 1;
out:
kfree(buf);
return error;
}
/**
* gi_get_cookie - Return the "cookie" (identifying string) for a
* filesystem mount
* @ip:
* @gi:
* @buf:
* @size:
* @count:
*
* Returns: errno
*/
static int
gi_get_cookie(struct gfs_inode *ip,
struct gfs_ioctl *gi,
char *buf,
unsigned int size,
unsigned int *count)
{
int error = -ENOBUFS;
if (gi->gi_argc != 1)
return -EINVAL;
gfs_printf("version 0\n");
gfs_printf("%lu", (unsigned long)ip->i_sbd);
error = 0;
out:
return error;
}
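/*
 * Note, illustrative and not part of this patch: gfs_printf(), used by
 * gi_get_cookie() above and the gi_get_*() fillers below, is assumed to be
 * a local macro of roughly this shape -- it appends to 'buf' and jumps to
 * the function's 'out:' label when 'size' would be exceeded, which is why
 * each filler pre-sets error to -ENOBUFS before its first gfs_printf().
 *
 *	#define gfs_printf(fmt, args...)				\
 *	do {								\
 *		if (count >= size)					\
 *			goto out;					\
 *		count += snprintf(buf + count, size - count,		\
 *				  fmt, ##args);				\
 *	} while (0)
 */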
/**
* gi_get_super - Return the "struct gfs_sb" for a filesystem
* @sdp:
* @gi:
*
* Returns: errno
*/
static int
gi_get_super(struct gfs_sbd *sdp, struct gfs_ioctl *gi)
{
struct gfs_holder sb_gh;
struct buffer_head *bh;
struct gfs_sb *sb;
int error;
if (gi->gi_argc != 1)
return -EINVAL;
if (gi->gi_size != sizeof(struct gfs_sb))
return -EINVAL;
sb = kmalloc(sizeof(struct gfs_sb), GFP_KERNEL);
if (!sb)
return -ENOMEM;
error = gfs_glock_nq_num(sdp,
GFS_SB_LOCK, &gfs_meta_glops,
LM_ST_SHARED, 0, &sb_gh);
if (error)
goto out;
error = gfs_dread(sb_gh.gh_gl, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift,
DIO_START | DIO_WAIT, &bh);
if (error) {
gfs_glock_dq_uninit(&sb_gh);
goto out;
}
gfs_sb_in(sb, bh->b_data);
brelse(bh);
gfs_glock_dq_uninit(&sb_gh);
if (copy_to_user(gi->gi_data, sb,
sizeof(struct gfs_sb)))
error = -EFAULT;
else
error = sizeof(struct gfs_sb);
out:
kfree(sb);
return error;
}
/**
* gi_get_args - Return the mount arguments
* @ip:
* @gi:
* @buf:
* @size:
* @count:
*
* Returns: errno
*/
static int
gi_get_args(struct gfs_inode *ip,
struct gfs_ioctl *gi,
char *buf,
unsigned int size,
unsigned int *count)
{
struct gfs_args *args = &ip->i_sbd->sd_args;
int error = -ENOBUFS;
if (gi->gi_argc != 1)
return -EINVAL;
gfs_printf("version 0\n");
gfs_printf("lockproto %s\n", args->ar_lockproto);
gfs_printf("locktable %s\n", args->ar_locktable);
gfs_printf("hostdata %s\n", args->ar_hostdata);
gfs_printf("ignore_local_fs %d\n", args->ar_ignore_local_fs);
gfs_printf("localcaching %d\n", args->ar_localcaching);
gfs_printf("localflocks %d\n", args->ar_localflocks);
gfs_printf("oopses_ok %d\n", args->ar_oopses_ok);
gfs_printf("upgrade %d\n", args->ar_upgrade);
gfs_printf("num_glockd %u\n", args->ar_num_glockd);
gfs_printf("posix_acls %d\n", args->ar_posix_acls);
gfs_printf("suiddir %d\n", args->ar_suiddir);
error = 0;
out:
return error;
}
/**
* gi_get_lockstruct - Return the information in the FS' lockstruct
* @ip:
* @gi:
* @buf:
* @size:
* @count:
*
* Returns: errno
*/
static int
gi_get_lockstruct(struct gfs_inode *ip,
struct gfs_ioctl *gi,
char *buf,
unsigned int size,
unsigned int *count)
{
struct lm_lockstruct *ls = &ip->i_sbd->sd_lockstruct;
int error = -ENOBUFS;
if (gi->gi_argc != 1)
return -EINVAL;
gfs_printf("version 0\n");
gfs_printf("jid %u\n", ls->ls_jid);
gfs_printf("first %u\n", ls->ls_first);
gfs_printf("lvb_size %u\n", ls->ls_lvb_size);
gfs_printf("flags %d\n", ls->ls_flags);
error = 0;
out:
return error;
}
/**
* gi_get_stat_gfs - Return a filesystem's space usage information
* @ip:
* @gi:
* @buf:
* @size:
* @count:
*
* Returns: errno
*/
static int
gi_get_stat_gfs(struct gfs_inode *ip,
struct gfs_ioctl *gi,
char *buf,
unsigned int size,
unsigned int *count)
{
struct gfs_stat_gfs sg;
int error;
if (gi->gi_argc != 1)
return -EINVAL;
error = gfs_stat_gfs(ip->i_sbd, &sg, TRUE);
if (error)
return error;
error = -ENOBUFS;
gfs_printf("version 0\n");
gfs_printf("bsize %u\n", ip->i_sbd->sd_sb.sb_bsize);
gfs_printf("total_blocks %"PRIu64"\n", sg.sg_total_blocks);
gfs_printf("free %"PRIu64"\n", sg.sg_free);
gfs_printf("used_dinode %"PRIu64"\n", sg.sg_used_dinode);
gfs_printf("free_dinode %"PRIu64"\n", sg.sg_free_dinode);
gfs_printf("used_meta %"PRIu64"\n", sg.sg_used_meta);
gfs_printf("free_meta %"PRIu64"\n", sg.sg_free_meta);
error = 0;
out:
return error;
}
/**
* handle_roll - Read an atomic_t as an unsigned int
* @a: a counter
*
* if @a is negative, reset it to zero
*
* Returns: the value of the counter
*/
static unsigned int
handle_roll(atomic_t *a)
{
int x = atomic_read(a);
if (x < 0) {
atomic_set(a, 0);
return 0;
}
return (unsigned int)x;
}
/**
* gi_get_counters - Return usage counters
* @ip:
* @gi:
* @buf:
* @size:
* @count:
*
* Returns: errno
*/
static int
gi_get_counters(struct gfs_inode *ip,
struct gfs_ioctl *gi,
char *buf,
unsigned int size,
unsigned int *count)
{
struct gfs_sbd *sdp = ip->i_sbd;
int error = -ENOBUFS;
if (gi->gi_argc != 1)
return -EINVAL;
gfs_printf("version 0\n");
gfs_printf("sd_glock_count:locks::%d\n",
atomic_read(&sdp->sd_glock_count));
gfs_printf("sd_glock_held_count:locks held::%d\n",
atomic_read(&sdp->sd_glock_held_count));
gfs_printf("sd_freeze_count:freeze count::%d\n",
sdp->sd_freeze_count);
gfs_printf("sd_inode_count:incore inodes::%d\n",
atomic_read(&sdp->sd_inode_count));
gfs_printf("sd_bufdata_count:metadata buffers::%d\n",
atomic_read(&sdp->sd_bufdata_count));
gfs_printf("sd_unlinked_ic_count:unlinked inodes::%d\n",
atomic_read(&sdp->sd_unlinked_ic_count));
gfs_printf("sd_quota_count:quota IDs::%d\n",
atomic_read(&sdp->sd_quota_count));
gfs_printf("sd_log_buffers:incore log buffers::%u\n",
sdp->sd_log_buffers);
gfs_printf("sd_log_seg_free:log segments free::%u\n",
sdp->sd_log_seg_free);
gfs_printf("ji_nsegment:log segments total::%u\n",
sdp->sd_jdesc.ji_nsegment);
gfs_printf("sd_mhc_count:meta header cache entries::%d\n",
atomic_read(&sdp->sd_mhc_count));
gfs_printf("sd_depend_count:glock dependencies::%d\n",
atomic_read(&sdp->sd_depend_count));
gfs_printf("sd_reclaim_count:glocks on reclaim list::%d\n",
atomic_read(&sdp->sd_reclaim_count));
gfs_printf("sd_log_wrap:log wraps::%"PRIu64"\n",
sdp->sd_log_wrap);
gfs_printf("sd_lm_outstanding:outstanding LM calls::%d\n",
atomic_read(&sdp->sd_lm_outstanding));
gfs_printf("sd_bio_outstanding:outstanding BIO calls::%u\n",
atomic_read(&sdp->sd_bio_outstanding));
gfs_printf("sd_fh2dentry_misses:fh2dentry misses:diff:%u\n",
handle_roll(&sdp->sd_fh2dentry_misses));
gfs_printf("sd_reclaimed:glocks reclaimed:diff:%u\n",
handle_roll(&sdp->sd_reclaimed));
gfs_printf("sd_glock_nq_calls:glock nq calls:diff:%u\n",
handle_roll(&sdp->sd_glock_nq_calls));
gfs_printf("sd_glock_dq_calls:glock dq calls:diff:%u\n",
handle_roll(&sdp->sd_glock_dq_calls));
gfs_printf("sd_glock_prefetch_calls:glock prefetch calls:diff:%u\n",
handle_roll(&sdp->sd_glock_prefetch_calls));
gfs_printf("sd_lm_lock_calls:lm_lock calls:diff:%u\n",
handle_roll(&sdp->sd_lm_lock_calls));
gfs_printf("sd_lm_unlock_calls:lm_unlock calls:diff:%u\n",
handle_roll(&sdp->sd_lm_unlock_calls));
gfs_printf("sd_lm_callbacks:lm callbacks:diff:%u\n",
handle_roll(&sdp->sd_lm_callbacks));
gfs_printf("sd_ops_address:address operations:diff:%u\n",
handle_roll(&sdp->sd_ops_address));
gfs_printf("sd_ops_dentry:dentry operations:diff:%u\n",
handle_roll(&sdp->sd_ops_dentry));
gfs_printf("sd_ops_export:export operations:diff:%u\n",
handle_roll(&sdp->sd_ops_export));
gfs_printf("sd_ops_file:file operations:diff:%u\n",
handle_roll(&sdp->sd_ops_file));
gfs_printf("sd_ops_inode:inode operations:diff:%u\n",
handle_roll(&sdp->sd_ops_inode));
gfs_printf("sd_ops_super:super operations:diff:%u\n",
handle_roll(&sdp->sd_ops_super));
gfs_printf("sd_ops_vm:vm operations:diff:%u\n",
handle_roll(&sdp->sd_ops_vm));
gfs_printf("sd_bio_reads:block I/O reads:diff:%u\n",
handle_roll(&sdp->sd_bio_reads) >>
(sdp->sd_sb.sb_bsize_shift - 9));
gfs_printf("sd_bio_writes:block I/O writes:diff:%u\n",
handle_roll(&sdp->sd_bio_writes) >>
(sdp->sd_sb.sb_bsize_shift - 9));
error = 0;
out:
return error;
}
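/*
 * Illustrative sketch (not part of this patch): a minimal userspace helper
 * for splitting one line of gi_get_counters() output. Each line has the
 * form "name:description:flag:value", where flag is either empty or "diff"
 * (a counter that userspace is expected to report as a delta; see
 * handle_roll() above). The field layout is inferred from the gfs_printf()
 * calls; the helper name is made up.
 */
#include <string.h>

static int parse_counter_line(char *line, char **name, char **desc,
                              char **flag, char **value)
{
        *name = strsep(&line, ":");
        *desc = strsep(&line, ":");
        *flag = strsep(&line, ":");
        *value = line;          /* remainder of the line, e.g. "42\n" */
        return (*name && *desc && *flag && *value) ? 0 : -1;
}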
/**
* gi_get_tune - Return current values of the tuneable parameters
* @ip:
* @gi:
* @buf:
* @size:
* @count:
*
* Returns: errno
*/
static int
gi_get_tune(struct gfs_inode *ip,
struct gfs_ioctl *gi,
char *buf,
unsigned int size,
unsigned int *count)
{
struct gfs_tune *gt = &ip->i_sbd->sd_tune;
int error = -ENOBUFS;
if (gi->gi_argc != 1)
return -EINVAL;
spin_lock(&gt->gt_spin);
gfs_printf("version 0\n");
gfs_printf("ilimit1 %u\n", gt->gt_ilimit1);
gfs_printf("ilimit1_tries %u\n", gt->gt_ilimit1_tries);
gfs_printf("ilimit1_min %u\n", gt->gt_ilimit1_min);
gfs_printf("ilimit2 %u\n", gt->gt_ilimit2);
gfs_printf("ilimit2_tries %u\n", gt->gt_ilimit2_tries);
gfs_printf("ilimit2_min %u\n", gt->gt_ilimit2_min);
gfs_printf("demote_secs %u\n", gt->gt_demote_secs);
gfs_printf("incore_log_blocks %u\n", gt->gt_incore_log_blocks);
gfs_printf("jindex_refresh_secs %u\n", gt->gt_jindex_refresh_secs);
gfs_printf("depend_secs %u\n", gt->gt_depend_secs);
gfs_printf("scand_secs %u\n", gt->gt_scand_secs);
gfs_printf("recoverd_secs %u\n", gt->gt_recoverd_secs);
gfs_printf("logd_secs %u\n", gt->gt_logd_secs);
gfs_printf("quotad_secs %u\n", gt->gt_quotad_secs);
gfs_printf("inoded_secs %u\n", gt->gt_inoded_secs);
gfs_printf("glock_purge %u\n", gt->gt_glock_purge);
gfs_printf("quota_simul_sync %u\n", gt->gt_quota_simul_sync);
gfs_printf("quota_warn_period %u\n", gt->gt_quota_warn_period);
gfs_printf("atime_quantum %u\n", gt->gt_atime_quantum);
gfs_printf("quota_quantum %u\n", gt->gt_quota_quantum);
gfs_printf("quota_scale_num %u\n", gt->gt_quota_scale_num);
gfs_printf("quota_scale_den %u\n", gt->gt_quota_scale_den);
gfs_printf("quota_enforce %u\n", gt->gt_quota_enforce);
gfs_printf("quota_account %u\n", gt->gt_quota_account);
gfs_printf("new_files_jdata %u\n", gt->gt_new_files_jdata);
gfs_printf("new_files_directio %u\n", gt->gt_new_files_directio);
gfs_printf("max_atomic_write %u\n", gt->gt_max_atomic_write);
gfs_printf("max_readahead %u\n", gt->gt_max_readahead);
gfs_printf("lockdump_size %u\n", gt->gt_lockdump_size);
gfs_printf("stall_secs %u\n", gt->gt_stall_secs);
gfs_printf("complain_secs %u\n", gt->gt_complain_secs);
gfs_printf("reclaim_limit %u\n", gt->gt_reclaim_limit);
gfs_printf("entries_per_readdir %u\n", gt->gt_entries_per_readdir);
gfs_printf("prefetch_secs %u\n", gt->gt_prefetch_secs);
gfs_printf("statfs_slots %u\n", gt->gt_statfs_slots);
gfs_printf("max_mhc %u\n", gt->gt_max_mhc);
gfs_printf("greedy_default %u\n", gt->gt_greedy_default);
gfs_printf("greedy_quantum %u\n", gt->gt_greedy_quantum);
gfs_printf("greedy_max %u\n", gt->gt_greedy_max);
gfs_printf("rgrp_try_threshold %u\n", gt->gt_rgrp_try_threshold);
gfs_printf("statfs_fast %u\n", gt->gt_statfs_fast);
+ gfs_printf("max_rgrp_free_mdata %u\n", gt->gt_max_rgrp_free_mdata);
error = 0;
out:
spin_unlock(&gt->gt_spin);
return error;
}
#define tune_set(f, v) \
do { \
spin_lock(&gt->gt_spin); \
gt->f = (v); \
spin_unlock(&gt->gt_spin); \
} while (0)
/**
* gi_set_tune - Set a tuneable parameter
* @sdp:
* @gi:
*
* Returns: errno
*/
static int
gi_set_tune(struct gfs_sbd *sdp, struct gfs_ioctl *gi, int from_user)
{
struct gfs_tune *gt = &sdp->sd_tune;
char param[ARG_SIZE], value[ARG_SIZE];
unsigned int x;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (gi->gi_argc != 3)
return -EINVAL;
if (from_user) {
if (strncpy_from_user(param, gi->gi_argv[1], ARG_SIZE) < 0)
return -EFAULT;
} else {
strncpy(param, gi->gi_argv[1], ARG_SIZE);
}
param[ARG_SIZE - 1] = 0;
if (from_user) {
if (strncpy_from_user(value, gi->gi_argv[2], ARG_SIZE) < 0)
return -EFAULT;
} else {
strncpy(value, gi->gi_argv[2], ARG_SIZE);
}
value[ARG_SIZE - 1] = 0;
if (strcmp(param, "ilimit1") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_ilimit1, x);
} else if (strcmp(param, "ilimit1_tries") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_ilimit1_tries, x);
} else if (strcmp(param, "ilimit1_min") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_ilimit1_min, x);
} else if (strcmp(param, "ilimit2") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_ilimit2, x);
} else if (strcmp(param, "ilimit2_tries") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_ilimit2_tries, x);
} else if (strcmp(param, "ilimit2_min") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_ilimit2_min, x);
} else if (strcmp(param, "demote_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_demote_secs, x);
} else if (strcmp(param, "incore_log_blocks") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_incore_log_blocks, x);
} else if (strcmp(param, "jindex_refresh_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_jindex_refresh_secs, x);
} else if (strcmp(param, "depend_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_depend_secs, x);
} else if (strcmp(param, "scand_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_scand_secs, x);
wake_up_process(sdp->sd_scand_process);
} else if (strcmp(param, "recoverd_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_recoverd_secs, x);
wake_up_process(sdp->sd_recoverd_process);
} else if (strcmp(param, "logd_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_logd_secs, x);
wake_up_process(sdp->sd_logd_process);
} else if (strcmp(param, "quotad_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_quotad_secs, x);
wake_up_process(sdp->sd_quotad_process);
} else if (strcmp(param, "inoded_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_inoded_secs, x);
wake_up_process(sdp->sd_inoded_process);
} else if (strcmp(param, "glock_purge") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_glock_purge, x);
} else if (strcmp(param, "quota_simul_sync") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_quota_simul_sync, x);
} else if (strcmp(param, "quota_warn_period") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_quota_warn_period, x);
} else if (strcmp(param, "atime_quantum") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_atime_quantum, x);
} else if (strcmp(param, "quota_quantum") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_quota_quantum, x);
} else if (strcmp(param, "quota_scale") == 0) {
unsigned int y;
if (sscanf(value, "%u %u", &x, &y) != 2 || !y)
return -EINVAL;
spin_lock(&gt->gt_spin);
gt->gt_quota_scale_num = x;
gt->gt_quota_scale_den = y;
spin_unlock(&gt->gt_spin);
} else if (strcmp(param, "quota_enforce") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
x = !!x;
spin_lock(&gt->gt_spin);
gt->gt_quota_enforce = x;
if (x)
gt->gt_quota_account = 1;
spin_unlock(&gt->gt_spin);
} else if (strcmp(param, "quota_account") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
x = !!x;
spin_lock(&gt->gt_spin);
gt->gt_quota_account = x;
if (x)
spin_unlock(&gt->gt_spin);
else {
unsigned int y;
gt->gt_quota_enforce = 0;
spin_unlock(&gt->gt_spin);
for (y = 0; y < 2; y++) {
gfs_log_flush(sdp);
gfs_sync_meta(sdp);
gfs_quota_sync(sdp);
}
}
} else if (strcmp(param, "new_files_jdata") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
x = !!x;
tune_set(gt_new_files_jdata, x);
} else if (strcmp(param, "new_files_directio") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
x = !!x;
tune_set(gt_new_files_directio, x);
} else if (strcmp(param, "max_atomic_write") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_max_atomic_write, x);
} else if (strcmp(param, "max_readahead") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_max_readahead, x);
} else if (strcmp(param, "lockdump_size") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_lockdump_size, x);
} else if (strcmp(param, "stall_secs") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_stall_secs, x);
} else if (strcmp(param, "complain_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_complain_secs, x);
} else if (strcmp(param, "reclaim_limit") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_reclaim_limit, x);
} else if (strcmp(param, "entries_per_readdir") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_entries_per_readdir, x);
} else if (strcmp(param, "prefetch_secs") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_prefetch_secs, x);
} else if (strcmp(param, "statfs_slots") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_statfs_slots, x);
} else if (strcmp(param, "max_mhc") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_max_mhc, x);
} else if (strcmp(param, "greedy_default") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_greedy_default, x);
} else if (strcmp(param, "greedy_quantum") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_greedy_quantum, x);
} else if (strcmp(param, "greedy_max") == 0) {
if (sscanf(value, "%u", &x) != 1 || !x)
return -EINVAL;
tune_set(gt_greedy_max, x);
} else if (strcmp(param, "rgrp_try_threshold") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
tune_set(gt_rgrp_try_threshold, x);
} else if (strcmp(param, "statfs_fast") == 0) {
if (sscanf(value, "%u", &x) != 1)
return -EINVAL;
error = gfs_statfs_init(sdp, x);
if (error)
return error;
else
tune_set(gt_statfs_fast, x);
-
-
+ } else if (strcmp(param, "max_rgrp_free_mdata") == 0) {
+ if (sscanf(value, "%u", &x) != 1)
+ return -EINVAL;
+ tune_set(gt_max_rgrp_free_mdata, x);
} else
return -EINVAL;
return 0;
}
/**
* gi_do_reclaim - Reclaim unused metadata
* @ip:
* @gi:
* @buf:
* @size:
* @count:
*
* Returns: errno
*/
static int
gi_do_reclaim(struct gfs_inode *ip,
struct gfs_ioctl *gi,
char *buf,
unsigned int size,
unsigned int *count)
{
uint64_t inodes, metadata;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (gi->gi_argc != 1)
return -EINVAL;
error = gfs_reclaim_metadata(ip->i_sbd,
&inodes,
- &metadata);
+ &metadata,
+ ip->i_sbd->sd_rgcount);
if (error)
return error;
error = -ENOBUFS;
gfs_printf("version 0\n");
gfs_printf("inodes %"PRIu64"\n", inodes);
gfs_printf("metadata %"PRIu64"\n", metadata);
error = 0;
out:
return error;
}
/**
* gi_do_shrink - throw out unused glocks
* @sdp:
* @gi:
*
* Returns: 0
*/
static int
gi_do_shrink(struct gfs_sbd *sdp, struct gfs_ioctl *gi)
{
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (gi->gi_argc != 1)
return -EINVAL;
gfs_gl_hash_clear(sdp, FALSE);
return 0;
}
/**
* gi_get_file_stat - Return a file's dinode (stat) information
* @ip:
* @gi:
*
* Returns: the number of bytes copied, or -errno
*/
static int
gi_get_file_stat(struct gfs_inode *ip, struct gfs_ioctl *gi)
{
struct gfs_holder i_gh;
struct gfs_dinode *di;
int error;
if (gi->gi_argc != 1)
return -EINVAL;
if (gi->gi_size != sizeof(struct gfs_dinode))
return -EINVAL;
di = kmalloc(sizeof(struct gfs_dinode), GFP_KERNEL);
if (!di)
return -ENOMEM;
error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
goto out;
memcpy(di, &ip->i_di, sizeof(struct gfs_dinode));
gfs_glock_dq_uninit(&i_gh);
if (copy_to_user(gi->gi_data, di,
sizeof(struct gfs_dinode)))
error = -EFAULT;
else
error = sizeof(struct gfs_dinode);
out:
kfree(di);
return error;
}
/**
* gi_set_file_flag - set or clear a flag on a file
* @ip:
* @gi:
*
* Returns: errno
*/
static int
gi_set_file_flag(struct gfs_inode *ip, struct gfs_ioctl *gi, int from_user)
{
char buf[ARG_SIZE];
int set;
uint32_t flag;
struct gfs_holder i_gh;
struct buffer_head *dibh;
int error;
if (gi->gi_argc != 3)
return -EINVAL;
if (from_user) {
if (strncpy_from_user(buf, gi->gi_argv[1], ARG_SIZE) < 0)
return -EFAULT;
} else {
strncpy(buf, gi->gi_argv[1], ARG_SIZE);
}
buf[ARG_SIZE - 1] = 0;
if (strcmp(buf, "set") == 0)
set = TRUE;
else if (strcmp(buf, "clear") == 0)
set = FALSE;
else
return -EINVAL;
if (from_user) {
if (strncpy_from_user(buf, gi->gi_argv[2], ARG_SIZE) < 0)
return -EFAULT;
} else {
strncpy(buf, gi->gi_argv[2], ARG_SIZE);
}
buf[ARG_SIZE - 1] = 0;
error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
if (error)
return error;
error = -EACCES;
if (ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
goto out;
error = -EINVAL;
if (strcmp(buf, "jdata") == 0) {
if (ip->i_di.di_type != GFS_FILE_REG ||
ip->i_di.di_size)
goto out;
flag = GFS_DIF_JDATA;
} else if (strcmp(buf, "directio") == 0) {
if (ip->i_di.di_type != GFS_FILE_REG)
goto out;
flag = GFS_DIF_DIRECTIO;
} else if (strcmp(buf, "immutable") == 0) {
/* The IMMUTABLE flag can only be changed by
a process with the relevant capability. */
error = -EPERM;
if (!capable(CAP_LINUX_IMMUTABLE))
goto out;
flag = GFS_DIF_IMMUTABLE;
} else if (strcmp(buf, "appendonly") == 0) {
/* The APPENDONLY flag can only be changed by
a process with the relevant capability. */
error = -EPERM;
if (!capable(CAP_LINUX_IMMUTABLE))
goto out;
flag = GFS_DIF_APPENDONLY;
} else if (strcmp(buf, "inherit_jdata") == 0) {
if (ip->i_di.di_type != GFS_FILE_DIR)
goto out;
flag = GFS_DIF_INHERIT_JDATA;
} else if (strcmp(buf, "inherit_directio") == 0) {
if (ip->i_di.di_type != GFS_FILE_DIR)
goto out;
flag = GFS_DIF_INHERIT_DIRECTIO;
} else
goto out;
error = gfs_trans_begin(ip->i_sbd, 1, 0);
if (error)
goto out;
error = gfs_get_inode_buffer(ip, &dibh);
if (error)
goto out_trans_end;
if (set)
ip->i_di.di_flags |= flag;
else
ip->i_di.di_flags &= ~flag;
gfs_trans_add_bh(ip->i_gl, dibh);
gfs_dinode_out(&ip->i_di, dibh->b_data);
brelse(dibh);
out_trans_end:
gfs_trans_end(ip->i_sbd);
out:
gfs_glock_dq_uninit(&i_gh);
return error;
}
/**
* gi_get_file_meta - Return all the metadata for a file
* @ip:
* @gi:
*
* Returns: the number of bytes copied, or -errno
*/
static int
gi_get_file_meta(struct gfs_inode *ip, struct gfs_ioctl *gi)
{
struct gfs_holder i_gh;
struct gfs_user_buffer ub;
int error;
if (gi->gi_argc != 1)
return -EINVAL;
ub.ub_data = gi->gi_data;
ub.ub_size = gi->gi_size;
ub.ub_count = 0;
error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
error = -EACCES;
if (ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
goto out;
error = gfs_get_file_meta(ip, &ub);
if (error)
goto out;
if (ip->i_di.di_type == GFS_FILE_DIR &&
(ip->i_di.di_flags & GFS_DIF_EXHASH)) {
error = gfs_get_dir_meta(ip, &ub);
if (error)
goto out;
}
if (ip->i_di.di_eattr) {
error = gfs_get_eattr_meta(ip, &ub);
if (error)
goto out;
}
error = ub.ub_count;
out:
gfs_glock_dq_uninit(&i_gh);
return error;
}
/**
* gi_do_file_flush - sync out all dirty data and
* drop the cache (and lock) for a file.
* @ip:
* @gi:
*
* Returns: errno
*/
static int
gi_do_file_flush(struct gfs_inode *ip, struct gfs_ioctl *gi)
{
if (gi->gi_argc != 1)
return -EINVAL;
gfs_glock_force_drop(ip->i_gl);
return 0;
}
/**
* gi2hip - return the "struct gfs_inode" for a hidden file
* @sdp:
* @gi:
*
* Returns: the "struct gfs_inode"
*/
static struct gfs_inode *
gi2hip(struct gfs_sbd *sdp, struct gfs_ioctl *gi, int from_user)
{
char buf[ARG_SIZE];
if (gi->gi_argc != 2)
return ERR_PTR(-EINVAL);
if (from_user) {
if (strncpy_from_user(buf, gi->gi_argv[1], ARG_SIZE) < 0)
return ERR_PTR(-EFAULT);
} else {
strncpy(buf, gi->gi_argv[1], ARG_SIZE);
}
buf[ARG_SIZE - 1] = 0;
if (strcmp(buf, "jindex") == 0)
return sdp->sd_jiinode;
else if (strcmp(buf, "rindex") == 0)
return sdp->sd_riinode;
else if (strcmp(buf, "quota") == 0)
return sdp->sd_qinode;
else if (strcmp(buf, "license") == 0)
return sdp->sd_linode;
else
return ERR_PTR(-EINVAL);
}
/**
* gi_get_hfile_stat - get stat info on a hidden file
* @sdp:
* @gi:
*
* Returns: the number of bytes copied, or -errno
*/
static int
gi_get_hfile_stat(struct gfs_sbd *sdp, struct gfs_ioctl *gi, int from_user)
{
struct gfs_inode *ip;
struct gfs_dinode *di;
struct gfs_holder i_gh;
int error;
ip = gi2hip(sdp, gi, from_user);
if (IS_ERR(ip))
return PTR_ERR(ip);
if (gi->gi_size != sizeof(struct gfs_dinode))
return -EINVAL;
di = kmalloc(sizeof(struct gfs_dinode), GFP_KERNEL);
if (!di)
return -ENOMEM;
error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
goto out;
memcpy(di, &ip->i_di, sizeof(struct gfs_dinode));
gfs_glock_dq_uninit(&i_gh);
if (copy_to_user(gi->gi_data, di,
sizeof(struct gfs_dinode)))
error = -EFAULT;
else
error = sizeof(struct gfs_dinode);
out:
kfree(di);
return error;
}
/**
* gi_do_hfile_read - Read data from a hidden file
* @sdp:
* @gi:
*
* Returns: the number of bytes read, or -errno
*/
static int
gi_do_hfile_read(struct gfs_sbd *sdp, struct gfs_ioctl *gi, int from_user)
{
struct gfs_inode *ip;
struct gfs_holder i_gh;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
ip = gi2hip(sdp, gi, from_user);
if (IS_ERR(ip))
return PTR_ERR(ip);
if (!access_ok(VERIFY_WRITE, gi->gi_data, gi->gi_size))
return -EFAULT;
error = gfs_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
if (error)
return error;
error = gfs_readi(ip, gi->gi_data, gi->gi_offset, gi->gi_size,
gfs_copy2user);
gfs_glock_dq_uninit(&i_gh);
return error;
}
/**
* gi_do_hfile_write - Write data to a hidden file
* @sdp:
* @gi:
*
* Returns: the number of bytes written, or -errno
*/
static int
gi_do_hfile_write(struct gfs_sbd *sdp, struct gfs_ioctl *gi, int from_user)
{
struct gfs_inode *ip;
struct gfs_alloc *al = NULL;
struct gfs_holder i_gh;
unsigned int data_blocks, ind_blocks;
int alloc_required;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
ip = gi2hip(sdp, gi, from_user);
if (IS_ERR(ip))
return PTR_ERR(ip);
if (!access_ok(VERIFY_READ, gi->gi_data, gi->gi_size))
return -EFAULT;
gfs_write_calc_reserv(ip, gi->gi_size, &data_blocks, &ind_blocks);
error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE,
LM_FLAG_PRIORITY | GL_SYNC | GL_NOCANCEL_OTHER,
&i_gh);
if (error)
return error;
if (!gfs_is_jdata(ip)) {
gfs_consist_inode(ip);
error = -EIO;
goto out;
}
error = gfs_write_alloc_required(ip, gi->gi_offset, gi->gi_size,
&alloc_required);
if (error)
goto out;
if (alloc_required) {
al = gfs_alloc_get(ip);
error = gfs_quota_hold_m(ip, NO_QUOTA_CHANGE,
NO_QUOTA_CHANGE);
if (error)
goto out_alloc;
al->al_requested_meta = ind_blocks + data_blocks;
error = gfs_inplace_reserve(ip);
if (error)
goto out_qs;
/* Trans may require:
All blocks for a RG bitmap, all the "data" blocks, whatever
indirect blocks we need, a modified dinode, and a quota change */
error = gfs_trans_begin(sdp,
1 + al->al_rgd->rd_ri.ri_length +
ind_blocks + data_blocks, 1);
if (error)
goto out_relse;
} else {
/* Trans may require:
All the "data" blocks and a modified dinode. */
error = gfs_trans_begin(sdp, 1 + data_blocks, 0);
if (error)
goto out_relse;
}
if (from_user)
error = gfs_writei(ip, gi->gi_data, gi->gi_offset, gi->gi_size,
gfs_copy_from_user, NULL);
else
error = gfs_writei(ip, gi->gi_data, gi->gi_offset, gi->gi_size,
gfs_copy_from_mem, NULL);
gfs_trans_end(sdp);
out_relse:
if (alloc_required) {
gfs_assert_warn(sdp, error || al->al_alloced_meta);
gfs_inplace_release(ip);
}
out_qs:
if (alloc_required)
gfs_quota_unhold_m(ip);
out_alloc:
if (alloc_required)
gfs_alloc_put(ip);
out:
ip->i_gl->gl_vn++;
gfs_glock_dq_uninit(&i_gh);
return error;
}
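/*
 * Worked example (illustrative, not part of this patch): writing 8 data
 * blocks that need 1 new indirect block into an rgrp whose header and
 * bitmaps span ri_length = 3 blocks reserves
 *     1 (dinode) + 3 (rgrp bitmap) + 1 (indirect) + 8 (data) = 13
 * buffers plus one quota change in gfs_trans_begin() above; on the
 * no-allocation path only 1 + 8 = 9 buffers and no quota change are needed.
 */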
/**
* gi_do_hfile_trunc - truncate a hidden file
* @sdp:
* @gi:
*
* Returns: errno
*/
static int
gi_do_hfile_trunc(struct gfs_sbd *sdp, struct gfs_ioctl *gi, int from_user)
{
struct gfs_inode *ip;
struct gfs_holder i_gh;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
ip = gi2hip(sdp, gi, from_user);
if (IS_ERR(ip))
return PTR_ERR(ip);
error = gfs_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SYNC, &i_gh);
if (error)
return error;
error = gfs_truncatei(ip, gi->gi_offset, NULL);
ip->i_gl->gl_vn++;
gfs_glock_dq_uninit(&i_gh);
return error;
}
/**
* gi_do_quota_sync - sync the outstanding quota changes for a FS
* @sdp:
* @gi:
*
* Returns: errno
*/
static int
gi_do_quota_sync(struct gfs_sbd *sdp, struct gfs_ioctl *gi)
{
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (gi->gi_argc != 1)
return -EINVAL;
return gfs_quota_sync(sdp);
}
/**
* gi_do_quota_refresh - Refresh a quota LVB from the quota file
* @sdp:
* @gi:
*
* Returns: errno
*/
static int
gi_do_quota_refresh(struct gfs_sbd *sdp, struct gfs_ioctl *gi, int from_user)
{
char buf[ARG_SIZE];
int user;
uint32_t id;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (gi->gi_argc != 2)
return -EINVAL;
if (from_user) {
if (strncpy_from_user(buf, gi->gi_argv[1], ARG_SIZE) < 0)
return -EFAULT;
} else {
strncpy(buf, gi->gi_argv[1], ARG_SIZE);
}
buf[ARG_SIZE - 1] = 0;
switch (buf[0]) {
case 'u':
user = TRUE;
break;
case 'g':
user = FALSE;
break;
default:
return -EINVAL;
}
if (buf[1] != ':')
return -EINVAL;
if (sscanf(buf + 2, "%u", &id) != 1)
return -EINVAL;
return gfs_quota_refresh(sdp, user, id);
}
/**
* gi_do_quota_read - read quota values from the quota file
* @sdp:
* @gi:
*
* Returns: errno
*/
static int
gi_do_quota_read(struct gfs_sbd *sdp, struct gfs_ioctl *gi, int from_user)
{
char buf[ARG_SIZE];
int user;
uint32_t id;
struct gfs_quota q;
int error;
if (gi->gi_argc != 2)
return -EINVAL;
if (gi->gi_size != sizeof(struct gfs_quota))
return -EINVAL;
if (from_user) {
if (strncpy_from_user(buf, gi->gi_argv[1], ARG_SIZE) < 0)
return -EFAULT;
} else {
strncpy(buf, gi->gi_argv[1], ARG_SIZE);
}
buf[ARG_SIZE - 1] = 0;
switch (buf[0]) {
case 'u':
user = TRUE;
break;
case 'g':
user = FALSE;
break;
default:
return -EINVAL;
}
if (buf[1] != ':')
return -EINVAL;
if (sscanf(buf + 2, "%u", &id) != 1)
return -EINVAL;
error = gfs_quota_read(sdp, user, id, &q);
if (error)
return error;
if (copy_to_user(gi->gi_data, &q, sizeof(struct gfs_quota)))
return -EFAULT;
return 0;
}
int
gfs_ioctl_i_local(struct gfs_inode *ip, struct gfs_ioctl *gi, const char *arg0,
int from_user)
{
int error = -EFAULT;
if (strcmp(arg0, "get_cookie") == 0)
error = gi_skeleton(ip, gi, gi_get_cookie);
else if (strcmp(arg0, "get_super") == 0)
error = gi_get_super(ip->i_sbd, gi);
else if (strcmp(arg0, "get_args") == 0)
error = gi_skeleton(ip, gi, gi_get_args);
else if (strcmp(arg0, "get_lockstruct") == 0)
error = gi_skeleton(ip, gi, gi_get_lockstruct);
else if (strcmp(arg0, "get_stat_gfs") == 0)
error = gi_skeleton(ip, gi, gi_get_stat_gfs);
else if (strcmp(arg0, "get_counters") == 0)
error = gi_skeleton(ip, gi, gi_get_counters);
else if (strcmp(arg0, "get_tune") == 0)
error = gi_skeleton(ip, gi, gi_get_tune);
else if (strcmp(arg0, "set_tune") == 0)
error = gi_set_tune(ip->i_sbd, gi, from_user);
else if (strcmp(arg0, "do_reclaim") == 0)
error = gi_skeleton(ip, gi, gi_do_reclaim);
else if (strcmp(arg0, "do_shrink") == 0)
error = gi_do_shrink(ip->i_sbd, gi);
else if (strcmp(arg0, "get_file_stat") == 0)
error = gi_get_file_stat(ip, gi);
else if (strcmp(arg0, "set_file_flag") == 0)
error = gi_set_file_flag(ip, gi, from_user);
else if (strcmp(arg0, "get_file_meta") == 0)
error = gi_get_file_meta(ip, gi);
else if (strcmp(arg0, "get_file_meta_quota") == 0)
error = gi_get_file_meta(ip->i_sbd->sd_qinode, gi);
else if (strcmp(arg0, "do_file_flush") == 0)
error = gi_do_file_flush(ip, gi);
else if (strcmp(arg0, "get_hfile_stat") == 0)
error = gi_get_hfile_stat(ip->i_sbd, gi, from_user);
else if (strcmp(arg0, "do_hfile_read") == 0)
error = gi_do_hfile_read(ip->i_sbd, gi, from_user);
else if (strcmp(arg0, "do_hfile_write") == 0)
error = gi_do_hfile_write(ip->i_sbd, gi, from_user);
else if (strcmp(arg0, "do_hfile_trunc") == 0)
error = gi_do_hfile_trunc(ip->i_sbd, gi, from_user);
else if (strcmp(arg0, "do_quota_sync") == 0)
error = gi_do_quota_sync(ip->i_sbd, gi);
else if (strcmp(arg0, "do_quota_refresh") == 0)
error = gi_do_quota_refresh(ip->i_sbd, gi, from_user);
else if (strcmp(arg0, "do_quota_read") == 0)
error = gi_do_quota_read(ip->i_sbd, gi, from_user);
else
error = -ENOTTY;
return error;
}
/**
* gfs_ioctl_i - Normal ioctls
* @ip:
* @arg:
*
* Returns: -errno or positive byte count
*/
int
gfs_ioctl_i(struct gfs_inode *ip, void *arg)
{
struct gfs_ioctl *gi_user = (struct gfs_ioctl *)arg;
struct gfs_ioctl gi;
char **argv;
char arg0[ARG_SIZE];
int error = -EFAULT;
if (copy_from_user(&gi, gi_user, sizeof(struct gfs_ioctl)))
return -EFAULT;
if (!gi.gi_argc)
return -EINVAL;
argv = kmalloc(gi.gi_argc * sizeof(char *), GFP_KERNEL);
if (!argv)
return -ENOMEM;
if (copy_from_user(argv, gi.gi_argv,
gi.gi_argc * sizeof(char *)))
goto out;
gi.gi_argv = argv;
if (strncpy_from_user(arg0, argv[0], ARG_SIZE) < 0)
goto out;
arg0[ARG_SIZE - 1] = 0;
error = gfs_ioctl_i_local(ip, &gi, arg0, 1);
out:
kfree(argv);
return error;
}
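/*
 * Illustrative sketch (not part of this patch): how a userspace tool might
 * drive this argv-style interface. The gi_argc/gi_argv/gi_data/gi_size
 * fields mirror the kernel-side parsing above; the header location and the
 * GFS_IOCTL_SUPER request number are assumptions, not verified against the
 * real gfs headers.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/gfs_ioctl.h>    /* assumed location of struct gfs_ioctl */

static int example_get_tune(int fd)
{
        char buf[4096];
        char *argv[] = { (char *)"get_tune" };
        struct gfs_ioctl gi = {
                .gi_argc = 1,
                .gi_argv = argv,
                .gi_data = buf,
                .gi_size = sizeof(buf),
        };
        int count = ioctl(fd, GFS_IOCTL_SUPER, &gi);    /* assumed request */

        if (count < 0)
                return -1;
        fwrite(buf, 1, count, stdout);  /* "version 0\n" then "name value" lines */
        return 0;
}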
#ifdef CONFIG_COMPAT
/**
* gfs_ioctl_i_compat - compatibility ioctls
* These ioctls provide support for situations
* where the userland and kernel architectures differ.
* For example, userland may be 32-bit ppc whereas the
* kernel may be ppc64. In this case, we need to do
* extra translation between the addresses.
* @ip:
* @arg:
*
* Returns: -errno or positive byte count
*/
int
gfs_ioctl_i_compat(struct gfs_inode *ip, unsigned long arg)
{
struct gfs_ioctl_compat *src;
struct gfs_ioctl dst;
char **argv, *argptr;
uint32_t *ptr;
char arg0[ARG_SIZE];
char *tmparg;
int i;
int error = -EFAULT;
src = (struct gfs_ioctl_compat *)compat_ptr(arg);
memset(&dst, 0, sizeof(dst));
dst.gi_argc = src->gi_argc;
dst.gi_size = src->gi_size;
dst.gi_offset = src->gi_offset;
argv = kmalloc(dst.gi_argc * sizeof(char *), GFP_KERNEL);
if (!argv)
return -ENOMEM;
memset(argv, 0, dst.gi_argc * sizeof(char *));
ptr = (uint32_t *)compat_ptr(src->gi_argv);
for (i = 0; i < dst.gi_argc; i++) { /* for each parm */
tmparg = kmalloc(ARG_SIZE * sizeof(char *), GFP_KERNEL);
if (!tmparg)
goto out;
argptr = (char *)compat_ptr(*ptr);
if (strncpy_from_user(tmparg, argptr, ARG_SIZE) < 0)
goto out;
argv[i] = tmparg;
ptr++;
}
strncpy(arg0, argv[0], ARG_SIZE);
arg0[ARG_SIZE - 1] = 0;
dst.gi_argv = argv;
dst.gi_data = compat_ptr(src->gi_data);
error = gfs_ioctl_i_local(ip, &dst, arg0, 0);
out:
for (i = 0; i < dst.gi_argc; i++)
kfree(argv[i]);
kfree(argv);
return error;
}
#endif
diff --git a/gfs-kernel/src/gfs/rgrp.c b/gfs-kernel/src/gfs/rgrp.c
index e0c670ffa..e3de6b19a 100644
--- a/gfs-kernel/src/gfs/rgrp.c
+++ b/gfs-kernel/src/gfs/rgrp.c
@@ -1,2152 +1,2164 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/semaphore.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include "gfs.h"
#include "bits.h"
#include "dio.h"
#include "file.h"
#include "glock.h"
#include "glops.h"
#include "rgrp.h"
#include "super.h"
#include "trans.h"
/**
* mhc_hash - find the mhc hash bucket for a buffer
* @bh: the buffer
*
* Returns: The bucket number
*/
static unsigned int
mhc_hash(struct buffer_head *bh)
{
uint64_t blkno;
unsigned int h;
blkno = bh->b_blocknr;
h = gfs_hash(&blkno, sizeof(uint64_t)) & GFS_MHC_HASH_MASK;
return h;
}
/**
* mhc_trim - Throw away cached meta-headers, if there are too many of them
* @sdp: The filesystem instance
* @max: Max # of cached meta-headers allowed to survive
*
* Walk filesystem's list of cached meta-headers, in least-recently-used order,
* and keep throwing them away until we're under the max threshold.
*/
static void
mhc_trim(struct gfs_sbd *sdp, unsigned int max)
{
struct gfs_meta_header_cache *mc;
for (;;) {
spin_lock(&sdp->sd_mhc_lock);
if (list_empty(&sdp->sd_mhc_single)) {
spin_unlock(&sdp->sd_mhc_lock);
return;
} else {
mc = list_entry(sdp->sd_mhc_single.prev,
struct gfs_meta_header_cache,
mc_list_single);
list_del(&mc->mc_list_hash);
list_del(&mc->mc_list_single);
list_del(&mc->mc_list_rgd);
spin_unlock(&sdp->sd_mhc_lock);
kmem_cache_free(gfs_mhc_cachep, mc);
atomic_dec(&sdp->sd_mhc_count);
if (atomic_read(&sdp->sd_mhc_count) <= max)
return;
}
}
}
/**
* gfs_mhc_add - add buffer(s) to the cache of metadata headers
* @rgd: Resource Group in which the buffered block(s) reside
* @bh: an array of buffer_head pointers
* @num: the number of bh pointers in the array
*
* Increment each meta-header's generation # by 2.
* Alloc and add each gfs_meta_header_cache to 3 lists/caches:
* Filesystem's meta-header cache (hash)
* Filesystem's list of cached meta-headers
* Resource Group's list of cached meta-headers
* If we now have too many cached, throw some older ones away
*/
void
gfs_mhc_add(struct gfs_rgrpd *rgd,
struct buffer_head **bh, unsigned int num)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
unsigned int x;
for (x = 0; x < num; x++) {
struct gfs_meta_header_cache *mc;
struct list_head *head;
uint64_t gen;
if (gfs_meta_check(sdp, bh[x]))
return;
mc = kmem_cache_alloc(gfs_mhc_cachep, GFP_KERNEL);
if (!mc)
return;
memset(mc, 0, sizeof(struct gfs_meta_header_cache));
mc->mc_block = bh[x]->b_blocknr;
memcpy(&mc->mc_mh, bh[x]->b_data,
sizeof(struct gfs_meta_header));
gen = gfs64_to_cpu(mc->mc_mh.mh_generation) + 2;
mc->mc_mh.mh_generation = cpu_to_gfs64(gen);
head = &sdp->sd_mhc[mhc_hash(bh[x])];
spin_lock(&sdp->sd_mhc_lock);
list_add(&mc->mc_list_hash, head);
list_add(&mc->mc_list_single, &sdp->sd_mhc_single);
list_add(&mc->mc_list_rgd, &rgd->rd_mhc);
spin_unlock(&sdp->sd_mhc_lock);
atomic_inc(&sdp->sd_mhc_count);
}
x = gfs_tune_get(sdp, gt_max_mhc);
/* If we've got too many cached, throw some older ones away */
if (atomic_read(&sdp->sd_mhc_count) > x)
mhc_trim(sdp, x);
}
/**
* gfs_mhc_fish - Try to fill in a meta buffer with meta-header from the cache
* @sdp: the filesystem
* @bh: the buffer to fill in
*
* Returns: TRUE if the buffer was cached, FALSE otherwise
*
* If buffer is referenced in meta-header cache (search using hash):
* Copy the cached meta-header into the buffer (instead of reading from disk).
* Note that only the meta-header portion of the buffer will have valid data
* (as would be on disk), rest of buffer does *not* reflect disk contents.
* Remove cached gfs_meta_header_cache from all cache lists, free its memory.
*/
int
gfs_mhc_fish(struct gfs_sbd *sdp, struct buffer_head *bh)
{
struct list_head *tmp, *head;
struct gfs_meta_header_cache *mc;
head = &sdp->sd_mhc[mhc_hash(bh)];
spin_lock(&sdp->sd_mhc_lock);
for (tmp = head->next;
tmp != head;
tmp = tmp->next) {
mc = list_entry(tmp, struct gfs_meta_header_cache, mc_list_hash);
if (mc->mc_block != bh->b_blocknr)
continue;
list_del(&mc->mc_list_hash);
list_del(&mc->mc_list_single);
list_del(&mc->mc_list_rgd);
spin_unlock(&sdp->sd_mhc_lock);
gfs_prep_new_buffer(bh);
memcpy(bh->b_data, &mc->mc_mh,
sizeof(struct gfs_meta_header));
kmem_cache_free(gfs_mhc_cachep, mc);
atomic_dec(&sdp->sd_mhc_count);
return TRUE;
}
spin_unlock(&sdp->sd_mhc_lock);
return FALSE;
}
/**
* gfs_mhc_zap - Throw away an RG's list of cached metadata headers
* @rgd: The resource group whose list we want to clear
*
* Simply throw away all cached metadata headers on RG's list,
* and free their memory.
*/
void
gfs_mhc_zap(struct gfs_rgrpd *rgd)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct gfs_meta_header_cache *mc;
spin_lock(&sdp->sd_mhc_lock);
while (!list_empty(&rgd->rd_mhc)) {
mc = list_entry(rgd->rd_mhc.next,
struct gfs_meta_header_cache,
mc_list_rgd);
list_del(&mc->mc_list_hash);
list_del(&mc->mc_list_single);
list_del(&mc->mc_list_rgd);
spin_unlock(&sdp->sd_mhc_lock);
kmem_cache_free(gfs_mhc_cachep, mc);
atomic_dec(&sdp->sd_mhc_count);
spin_lock(&sdp->sd_mhc_lock);
}
spin_unlock(&sdp->sd_mhc_lock);
}
/**
* depend_hash() - Turn glock number into hash bucket number
* @formal_ino:
*
* Returns: The number of the corresponding hash bucket
*/
static unsigned int
depend_hash(uint64_t formal_ino)
{
unsigned int h;
h = gfs_hash(&formal_ino, sizeof(uint64_t));
h &= GFS_DEPEND_HASH_MASK;
return h;
}
/**
* depend_sync_one - Sync metadata (not data) for a dependency inode
* @sdp: filesystem instance
* @gd: dependency descriptor
*
* Remove dependency from superblock's hash table and rgrp's list.
* Sync dependency inode's metadata to log and in-place location.
*/
static void
depend_sync_one(struct gfs_sbd *sdp, struct gfs_depend *gd)
{
struct gfs_glock *gl;
spin_lock(&sdp->sd_depend_lock);
list_del(&gd->gd_list_hash);
spin_unlock(&sdp->sd_depend_lock);
list_del(&gd->gd_list_rgd);
gl = gfs_glock_find(sdp,
&(struct lm_lockname){gd->gd_formal_ino,
LM_TYPE_INODE});
if (gl) {
if (gl->gl_ops->go_sync)
gl->gl_ops->go_sync(gl,
DIO_METADATA |
DIO_INVISIBLE);
gfs_glock_put(gl);
}
kfree(gd);
atomic_dec(&sdp->sd_depend_count);
}
/**
* depend_sync_old - Sync older rgrp-dependent inodes to disk.
* @rgd: Resource group containing dependent inodes
*
* Look at oldest entries in resource group's dependency list,
* sync 'em if they're older than timeout threshold.
*/
static void
depend_sync_old(struct gfs_rgrpd *rgd)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct gfs_depend *gd;
while (!list_empty(&rgd->rd_depend)) {
/* Oldest entries are in prev direction */
gd = list_entry(rgd->rd_depend.prev,
struct gfs_depend,
gd_list_rgd);
if (time_before(jiffies,
gd->gd_time +
gfs_tune_get(sdp, gt_depend_secs) * HZ))
return;
depend_sync_one(sdp, gd);
}
}
/**
* gfs_depend_add - Add a dependent inode to rgrp's and filesystem's list
* @rgd: Resource group containing blocks associated with inode
* @formal_ino: inode
*
* Dependent inodes must be flushed to log and in-place blocks before
* releasing an EXCLUSIVE rgrp lock.
* Find pre-existing dependency for this inode/rgrp combination in
* incore superblock struct's sd_depend hash table, or create a new one.
* Either way, move or attach dependency to head of superblock's hash bucket
* and top of rgrp's list.
* If we create a new one, take a moment to sync older dependencies to disk.
*/
void
gfs_depend_add(struct gfs_rgrpd *rgd, uint64_t formal_ino)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct list_head *head, *tmp;
struct gfs_depend *gd;
head = &sdp->sd_depend[depend_hash(formal_ino)];
spin_lock(&sdp->sd_depend_lock);
for (tmp = head->next;
tmp != head;
tmp = tmp->next) {
gd = list_entry(tmp, struct gfs_depend, gd_list_hash);
if (gd->gd_rgd == rgd &&
gd->gd_formal_ino == formal_ino) {
list_move(&gd->gd_list_hash, head);
spin_unlock(&sdp->sd_depend_lock);
list_move(&gd->gd_list_rgd, &rgd->rd_depend);
gd->gd_time = jiffies;
return;
}
}
spin_unlock(&sdp->sd_depend_lock);
gd = gmalloc(sizeof(struct gfs_depend));
memset(gd, 0, sizeof(struct gfs_depend));
gd->gd_rgd = rgd;
gd->gd_formal_ino = formal_ino;
gd->gd_time = jiffies;
spin_lock(&sdp->sd_depend_lock);
list_add(&gd->gd_list_hash, head);
spin_unlock(&sdp->sd_depend_lock);
list_add(&gd->gd_list_rgd, &rgd->rd_depend);
atomic_inc(&sdp->sd_depend_count);
depend_sync_old(rgd);
}
/**
* gfs_depend_sync - Sync metadata (not data) for an rgrp's dependent inodes
* @rgd: Resource group containing the dependent inodes
*
* As long as this node owns an EXCLUSIVE lock on the rgrp, we can keep
* rgrp's modified metadata blocks in buffer cache.
*
* When this node releases the EX lock, we must flush metadata, so other
* nodes can read the modified content from disk.
*/
void
gfs_depend_sync(struct gfs_rgrpd *rgd)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct gfs_depend *gd;
while (!list_empty(&rgd->rd_depend)) {
gd = list_entry(rgd->rd_depend.next,
struct gfs_depend,
gd_list_rgd);
depend_sync_one(sdp, gd);
}
}
/**
* rgrp_verify - Verify that a resource group is consistent
* @rgd: the rgrp
*
* Somebody should have already called gfs_glock_rg() on this RG.
*/
static void
rgrp_verify(struct gfs_rgrpd *rgd)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct gfs_bitmap *bits = NULL;
uint32_t length = rgd->rd_ri.ri_length;
uint32_t count[4], tmp;
int buf, x;
memset(count, 0, 4 * sizeof(uint32_t));
/* Count # blocks in each of 4 possible allocation states */
for (buf = 0; buf < length; buf++) {
bits = &rgd->rd_bits[buf];
for (x = 0; x < 4; x++)
count[x] += gfs_bitcount(rgd,
rgd->rd_bh[buf]->b_data +
bits->bi_offset,
bits->bi_len, x);
}
if (count[0] != rgd->rd_rg.rg_free) {
if (gfs_consist_rgrpd(rgd))
printk("GFS: fsid=%s: free data mismatch: %u != %u\n",
sdp->sd_fsname, count[0], rgd->rd_rg.rg_free);
return;
}
tmp = rgd->rd_ri.ri_data -
(rgd->rd_rg.rg_usedmeta + rgd->rd_rg.rg_freemeta) -
(rgd->rd_rg.rg_useddi + rgd->rd_rg.rg_freedi) -
rgd->rd_rg.rg_free;
if (count[1] != tmp) {
if (gfs_consist_rgrpd(rgd))
printk("GFS: fsid=%s: used data mismatch: %u != %u\n",
sdp->sd_fsname, count[1], tmp);
return;
}
if (count[2] != rgd->rd_rg.rg_freemeta) {
if (gfs_consist_rgrpd(rgd))
printk("GFS: fsid=%s: free metadata mismatch: %u != %u\n",
sdp->sd_fsname, count[2], rgd->rd_rg.rg_freemeta);
return;
}
tmp = rgd->rd_rg.rg_usedmeta +
(rgd->rd_rg.rg_useddi + rgd->rd_rg.rg_freedi);
if (count[3] != tmp) {
if (gfs_consist_rgrpd(rgd))
printk("GFS: fsid=%s: used metadata mismatch: %u != %u\n",
sdp->sd_fsname, count[3], tmp);
return;
}
}
/**
* gfs_blk2rgrpd - Find resource group for a given data/meta block number
* @sdp: The GFS superblock
* @blk: The data block number
*
* Returns: The resource group, or NULL if not found
*
* Don't try to use this for non-allocatable block numbers (i.e. rgrp header
* or bitmap blocks); it's for allocatable (data/meta) blocks only.
*/
struct gfs_rgrpd *
gfs_blk2rgrpd(struct gfs_sbd *sdp, uint64_t blk)
{
struct list_head *tmp, *head;
struct gfs_rgrpd *rgd = NULL;
struct gfs_rindex *ri;
spin_lock(&sdp->sd_rg_mru_lock);
for (head = &sdp->sd_rg_mru_list, tmp = head->next;
tmp != head;
tmp = tmp->next) {
rgd = list_entry(tmp, struct gfs_rgrpd, rd_list_mru);
ri = &rgd->rd_ri;
if (ri->ri_data1 <= blk && blk < ri->ri_data1 + ri->ri_data) {
list_move(&rgd->rd_list_mru, &sdp->sd_rg_mru_list);
spin_unlock(&sdp->sd_rg_mru_lock);
return rgd;
}
}
spin_unlock(&sdp->sd_rg_mru_lock);
return NULL;
}
/**
* gfs_rgrpd_get_first - get the first Resource Group in the filesystem
* @sdp: The GFS superblock
*
* Returns: The first rgrp in the filesystem
*/
struct gfs_rgrpd *
gfs_rgrpd_get_first(struct gfs_sbd *sdp)
{
gfs_assert(sdp, !list_empty(&sdp->sd_rglist),);
return list_entry(sdp->sd_rglist.next, struct gfs_rgrpd, rd_list);
}
/**
* gfs_rgrpd_get_next - get the next RG
* @rgd: A RG
*
* Returns: The next rgrp
*/
struct gfs_rgrpd *
gfs_rgrpd_get_next(struct gfs_rgrpd *rgd)
{
if (rgd->rd_list.next == &rgd->rd_sbd->sd_rglist)
return NULL;
return list_entry(rgd->rd_list.next, struct gfs_rgrpd, rd_list);
}
/**
* clear_rgrpdi - Clear up rgrps
* @sdp: The GFS superblock
*
*/
void
clear_rgrpdi(struct gfs_sbd *sdp)
{
struct gfs_rgrpd *rgd;
struct gfs_glock *gl;
spin_lock(&sdp->sd_rg_forward_lock);
sdp->sd_rg_forward = NULL;
spin_unlock(&sdp->sd_rg_forward_lock);
spin_lock(&sdp->sd_rg_recent_lock);
while (!list_empty(&sdp->sd_rg_recent)) {
rgd = list_entry(sdp->sd_rg_recent.next,
struct gfs_rgrpd, rd_recent);
list_del(&rgd->rd_recent);
}
spin_unlock(&sdp->sd_rg_recent_lock);
while (!list_empty(&sdp->sd_rglist)) {
rgd = list_entry(sdp->sd_rglist.next,
struct gfs_rgrpd, rd_list);
gl = rgd->rd_gl;
list_del(&rgd->rd_list);
list_del(&rgd->rd_list_mru);
if (gl) {
gfs_glock_force_drop(gl);
if (atomic_read(&gl->gl_lvb_count))
gfs_lvb_unhold(gl);
set_gl2rgd(gl, NULL);
gfs_glock_put(gl);
}
if (rgd->rd_bits)
kfree(rgd->rd_bits);
if (rgd->rd_bh)
kfree(rgd->rd_bh);
kfree(rgd);
}
}
/**
* gfs_clear_rgrpd - Clear up rgrps
* @sdp: The GFS superblock
*
*/
void
gfs_clear_rgrpd(struct gfs_sbd *sdp)
{
down(&sdp->sd_rindex_lock);
clear_rgrpdi(sdp);
up(&sdp->sd_rindex_lock);
}
/**
* compute_bitstructs - Compute the bitmap sizes
* @rgd: The resource group descriptor
*
* Calculates bitmap descriptors, one for each block that contains bitmap data
*
* Returns: errno
*/
static int
compute_bitstructs(struct gfs_rgrpd *rgd)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct gfs_bitmap *bits;
uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
uint32_t bytes_left, bytes;
int x;
rgd->rd_bits = kmalloc(length * sizeof(struct gfs_bitmap), GFP_KERNEL);
if (!rgd->rd_bits)
return -ENOMEM;
memset(rgd->rd_bits, 0, length * sizeof(struct gfs_bitmap));
bytes_left = rgd->rd_ri.ri_bitbytes;
for (x = 0; x < length; x++) {
bits = &rgd->rd_bits[x];
/* small rgrp; bitmap stored completely in header block */
if (length == 1) {
bytes = bytes_left;
bits->bi_offset = sizeof(struct gfs_rgrp);
bits->bi_start = 0;
bits->bi_len = bytes;
/* header block */
} else if (x == 0) {
bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs_rgrp);
bits->bi_offset = sizeof(struct gfs_rgrp);
bits->bi_start = 0;
bits->bi_len = bytes;
/* last block */
} else if (x + 1 == length) {
bytes = bytes_left;
bits->bi_offset = sizeof(struct gfs_meta_header);
bits->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
bits->bi_len = bytes;
/* other blocks */
} else {
bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs_meta_header);
bits->bi_offset = sizeof(struct gfs_meta_header);
bits->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
bits->bi_len = bytes;
}
bytes_left -= bytes;
}
if (bytes_left) {
gfs_consist_rgrpd(rgd);
return -EIO;
}
if ((rgd->rd_bits[length - 1].bi_start +
rgd->rd_bits[length - 1].bi_len) * GFS_NBBY !=
rgd->rd_ri.ri_data) {
if (gfs_consist_rgrpd(rgd)) {
gfs_rindex_print(&rgd->rd_ri);
printk("GFS: fsid=%s: start=%u len=%u offset=%u\n",
sdp->sd_fsname,
rgd->rd_bits[length - 1].bi_start,
rgd->rd_bits[length - 1].bi_len,
rgd->rd_bits[length - 1].bi_offset);
}
return -EIO;
}
rgd->rd_bh = kmalloc(length * sizeof(struct buffer_head *), GFP_KERNEL);
if (!rgd->rd_bh) {
kfree(rgd->rd_bits);
rgd->rd_bits = NULL;
return -ENOMEM;
}
memset(rgd->rd_bh, 0, length * sizeof(struct buffer_head *));
return 0;
}
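/*
 * Illustrative sketch (not part of this patch): the same layout arithmetic
 * as compute_bitstructs() in standalone form. Each bitmap byte describes
 * GFS_NBBY (4) blocks at 2 bits apiece; block 0 of the rgrp starts with a
 * struct gfs_rgrp header and the remaining bitmap blocks start with a
 * struct gfs_meta_header. The header sizes are taken as parameters here
 * rather than assumed.
 */
struct bitmap_slice {
        unsigned int offset;    /* byte offset of bitmap data within its block */
        unsigned int start;     /* starting byte within the overall bitmap */
        unsigned int len;       /* bytes of bitmap data in this block */
};

static void layout_bitmaps(struct bitmap_slice *bits, unsigned int length,
                           unsigned int bitbytes, unsigned int bsize,
                           unsigned int rgrp_hdr, unsigned int meta_hdr)
{
        unsigned int left = bitbytes, x;

        for (x = 0; x < length; x++) {
                unsigned int hdr = (x == 0) ? rgrp_hdr : meta_hdr;
                unsigned int bytes = (x + 1 == length) ? left : bsize - hdr;

                bits[x].offset = hdr;
                bits[x].start = bitbytes - left;
                bits[x].len = bytes;
                left -= bytes;
        }
}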
/**
* gfs_ri_update - Pull in a new resource index from the disk
* @ip: pointer to the rindex inode
*
* Returns: 0 on successful update, error code otherwise
*/
static int
gfs_ri_update(struct gfs_inode *ip)
{
struct gfs_sbd *sdp = ip->i_sbd;
struct gfs_rgrpd *rgd;
char buf[sizeof(struct gfs_rindex)];
int error;
if (do_mod(ip->i_di.di_size, sizeof(struct gfs_rindex))) {
gfs_consist_inode(ip);
return -EIO;
}
clear_rgrpdi(sdp);
for (sdp->sd_rgcount = 0;; sdp->sd_rgcount++) {
error = gfs_internal_read(ip, buf,
sdp->sd_rgcount *
sizeof(struct gfs_rindex),
sizeof(struct gfs_rindex));
if (!error)
break;
if (error != sizeof(struct gfs_rindex)) {
if (error > 0)
error = -EIO;
goto fail;
}
rgd = kmalloc(sizeof(struct gfs_rgrpd), GFP_KERNEL);
error = -ENOMEM;
if (!rgd)
goto fail;
memset(rgd, 0, sizeof(struct gfs_rgrpd));
INIT_LIST_HEAD(&rgd->rd_mhc);
INIT_LIST_HEAD(&rgd->rd_depend);
rgd->rd_sbd = sdp;
list_add_tail(&rgd->rd_list, &sdp->sd_rglist);
list_add_tail(&rgd->rd_list_mru, &sdp->sd_rg_mru_list);
gfs_rindex_in(&rgd->rd_ri, buf);
error = compute_bitstructs(rgd);
if (error)
goto fail;
error = gfs_glock_get(sdp, rgd->rd_ri.ri_addr, &gfs_rgrp_glops,
CREATE, &rgd->rd_gl);
if (error)
goto fail;
error = gfs_lvb_hold(rgd->rd_gl);
if (error)
goto fail;
set_gl2rgd(rgd->rd_gl, rgd);
rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
}
sdp->sd_riinode_vn = ip->i_gl->gl_vn;
return 0;
fail:
clear_rgrpdi(sdp);
return error;
}
/**
* gfs_rindex_hold - Grab a lock on the rindex
* @sdp: The GFS superblock
* @ri_gh: the glock holder
*
* We grab a lock on the rindex inode to make sure that it doesn't
* change whilst we are performing an operation. We keep this lock
* for quite long periods of time compared to other locks. This
* doesn't matter, since it is shared and it is very, very rarely
* accessed in the exclusive mode (i.e. only when expanding the filesystem).
*
* This makes sure that we're using the latest copy of the resource index
* special file, which might have been updated if someone expanded the
* filesystem (via gfs_grow utility), which adds new resource groups.
*
* Returns: 0 on success, error code otherwise
*/
int
gfs_rindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ri_gh)
{
struct gfs_inode *ip = sdp->sd_riinode;
struct gfs_glock *gl = ip->i_gl;
int error;
error = gfs_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
if (error)
return error;
/* Read new copy from disk if we don't have the latest */
if (sdp->sd_riinode_vn != gl->gl_vn) {
down(&sdp->sd_rindex_lock);
if (sdp->sd_riinode_vn != gl->gl_vn) {
error = gfs_ri_update(ip);
if (error)
gfs_glock_dq_uninit(ri_gh);
}
up(&sdp->sd_rindex_lock);
}
return error;
}
/**
* gfs_rgrp_read - Read in a RG's header and bitmaps
* @rgd: the struct gfs_rgrpd describing the RG to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
* Caller must eventually call gfs_rgrp_relse() to free the bitmaps.
*
* Returns: errno
*/
int
gfs_rgrp_read(struct gfs_rgrpd *rgd)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct gfs_glock *gl = rgd->rd_gl;
unsigned int x, length = rgd->rd_ri.ri_length;
int error;
for (x = 0; x < length; x++) {
gfs_assert_warn(sdp, !rgd->rd_bh[x]);
rgd->rd_bh[x] = gfs_dgetblk(gl, rgd->rd_ri.ri_addr + x);
}
for (x = 0; x < length; x++) {
error = gfs_dreread(sdp, rgd->rd_bh[x], DIO_START);
if (error)
goto fail;
}
for (x = length; x--;) {
error = gfs_dreread(sdp, rgd->rd_bh[x], DIO_WAIT);
if (error)
goto fail;
if (gfs_metatype_check(sdp, rgd->rd_bh[x],
(x) ? GFS_METATYPE_RB : GFS_METATYPE_RG)) {
error = -EIO;
goto fail;
}
}
if (rgd->rd_rg_vn != gl->gl_vn) {
gfs_rgrp_in(&rgd->rd_rg, (rgd->rd_bh[0])->b_data);
rgd->rd_rg_vn = gl->gl_vn;
}
return 0;
fail:
for (x = 0; x < length; x++) {
brelse(rgd->rd_bh[x]);
rgd->rd_bh[x] = NULL;
}
return error;
}
/**
* gfs_rgrp_relse - Release RG bitmaps read in with gfs_rgrp_read()
* @rgd: the struct gfs_rgrpd describing the RG whose buffers are released
*
*/
void
gfs_rgrp_relse(struct gfs_rgrpd *rgd)
{
int x, length = rgd->rd_ri.ri_length;
for (x = 0; x < length; x++) {
brelse(rgd->rd_bh[x]);
rgd->rd_bh[x] = NULL;
}
}
/**
* gfs_rgrp_lvb_fill - copy RG usage data out of the struct gfs_rgrp into the struct gfs_rgrp_lvb
* @rgd: the resource group data structure
*
*/
void
gfs_rgrp_lvb_fill(struct gfs_rgrpd *rgd)
{
struct gfs_rgrp *rg = &rgd->rd_rg;
struct gfs_rgrp_lvb *rb = (struct gfs_rgrp_lvb *)rgd->rd_gl->gl_lvb;
rb->rb_magic = cpu_to_gfs32(GFS_MAGIC);
rb->rb_free = cpu_to_gfs32(rg->rg_free);
rb->rb_useddi = cpu_to_gfs32(rg->rg_useddi);
rb->rb_freedi = cpu_to_gfs32(rg->rg_freedi);
rb->rb_usedmeta = cpu_to_gfs32(rg->rg_usedmeta);
rb->rb_freemeta = cpu_to_gfs32(rg->rg_freemeta);
}
/**
* gfs_rgrp_lvb_init - Init the data of a RG LVB
* @rgd: the resource group data structure
*
* Returns: errno
*/
int
gfs_rgrp_lvb_init(struct gfs_rgrpd *rgd)
{
struct gfs_glock *gl = rgd->rd_gl;
struct gfs_holder rgd_gh;
int error;
error = gfs_glock_nq_init(gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
if (!error) {
gfs_rgrp_lvb_fill(rgd);
gfs_glock_dq_uninit(&rgd_gh);
}
return error;
}
/**
* gfs_alloc_get - allocate a struct gfs_alloc structure for an inode
* @ip: the incore GFS inode structure
*
* Alloc and zero an in-place reservation structure,
* and attach it to the GFS incore inode.
*
* FIXME: Don't use gmalloc()
*
* Returns: the struct gfs_alloc
*/
struct gfs_alloc *
gfs_alloc_get(struct gfs_inode *ip)
{
struct gfs_alloc *al = ip->i_alloc;
gfs_assert_warn(ip->i_sbd, !al);
al = gmalloc(sizeof(struct gfs_alloc));
memset(al, 0, sizeof(struct gfs_alloc));
ip->i_alloc = al;
return al;
}
/**
* gfs_alloc_put - throw away the struct gfs_alloc for an inode
* @ip: the inode
*
*/
void
gfs_alloc_put(struct gfs_inode *ip)
{
struct gfs_alloc *al = ip->i_alloc;
if (gfs_assert_warn(ip->i_sbd, al))
return;
ip->i_alloc = NULL;
kfree(al);
}
/**
* try_rgrp_fit - See if a given reservation will fit in a given RG
* @rgd: the RG data
* @al: the struct gfs_alloc structure describing the reservation
*
* If there's room for the requested blocks to be allocated from the RG:
* Sets the al_reserved_data field in @al.
* Sets the al_reserved_meta field in @al.
* Sets the al_rgd field in @al.
*
* Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
*/
static int
try_rgrp_fit(struct gfs_rgrpd *rgd, struct gfs_alloc *al)
{
uint32_t freeblks = rgd->rd_rg.rg_free;
uint32_t freemeta = rgd->rd_rg.rg_freemeta;
uint32_t metares = al->al_requested_meta;
uint32_t datares = al->al_requested_data;
/* First take care of the data blocks required */
if (freeblks < al->al_requested_data)
return 0;
freeblks -= al->al_requested_data;
/* Then take care of the dinodes */
metares += al->al_requested_di;
/* Then take care of the metadata blocks */
while (freemeta < metares) {
if (freeblks < GFS_META_CLUMP)
return 0;
freeblks -= GFS_META_CLUMP;
freemeta += GFS_META_CLUMP;
datares += GFS_META_CLUMP;
}
al->al_rgd = rgd;
al->al_reserved_meta = metares;
al->al_reserved_data = datares;
return 1;
}
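/*
 * Worked example (illustrative, not part of this patch): with rg_free = 70
 * and rg_freemeta = 3, a request for 10 data blocks, 0 dinodes and
 * 8 metadata blocks runs as follows:
 *     freeblks = 70 - 10 = 60, metares = 8, freemeta = 3
 *     3 < 8, so one clump of free data blocks is converted:
 *     freeblks -= GFS_META_CLUMP, freemeta += GFS_META_CLUMP,
 *     datares += GFS_META_CLUMP
 * Assuming GFS_META_CLUMP is at least 5 and fits in the remaining 60 free
 * blocks, one conversion is enough and the reservation succeeds with
 * al_reserved_meta = 8 and al_reserved_data = 10 + GFS_META_CLUMP.
 */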
/**
* recent_rgrp_first - get first RG from "recent" list
* @sdp: The GFS superblock
* @rglast: address of the rgrp used last
*
* Returns: The first rgrp in the recent list
*/
static struct gfs_rgrpd *
recent_rgrp_first(struct gfs_sbd *sdp, uint64_t rglast)
{
struct list_head *tmp, *head;
struct gfs_rgrpd *rgd = NULL;
spin_lock(&sdp->sd_rg_recent_lock);
if (list_empty(&sdp->sd_rg_recent))
goto out;
if (!rglast)
goto first;
for (head = &sdp->sd_rg_recent, tmp = head->next;
tmp != head;
tmp = tmp->next) {
rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
if (rgd->rd_ri.ri_addr == rglast)
goto out;
}
first:
rgd = list_entry(sdp->sd_rg_recent.next, struct gfs_rgrpd, rd_recent);
out:
spin_unlock(&sdp->sd_rg_recent_lock);
return rgd;
}
/**
* recent_rgrp_next - get next RG from "recent" list
* @cur_rgd: current rgrp
* @remove:
*
* Returns: The next rgrp in the recent list
*/
static struct gfs_rgrpd *
recent_rgrp_next(struct gfs_rgrpd *cur_rgd, int remove)
{
struct gfs_sbd *sdp = cur_rgd->rd_sbd;
struct list_head *tmp, *head;
struct gfs_rgrpd *rgd;
spin_lock(&sdp->sd_rg_recent_lock);
for (head = &sdp->sd_rg_recent, tmp = head->next;
tmp != head;
tmp = tmp->next) {
rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
if (rgd == cur_rgd) {
if (cur_rgd->rd_recent.next != head)
rgd = list_entry(cur_rgd->rd_recent.next,
struct gfs_rgrpd, rd_recent);
else
rgd = NULL;
if (remove)
list_del(&cur_rgd->rd_recent);
goto out;
}
}
rgd = NULL;
if (!list_empty(head))
rgd = list_entry(head->next, struct gfs_rgrpd, rd_recent);
out:
spin_unlock(&sdp->sd_rg_recent_lock);
return rgd;
}
/**
* recent_rgrp_add - add an RG to tail of "recent" list
* @new_rgd: The rgrp to add
*
* Before adding, make sure that:
* 1) it's not already on the list
* 2) there's still room for more entries
* The capacity limit imposed on the "recent" list is basically a node's "share"
* of rgrps within a cluster, i.e. (total # rgrps) / (# nodes (journals))
*/
static void
recent_rgrp_add(struct gfs_rgrpd *new_rgd)
{
struct gfs_sbd *sdp = new_rgd->rd_sbd;
struct list_head *tmp, *head;
struct gfs_rgrpd *rgd = NULL;
unsigned int count = 0;
unsigned int max = sdp->sd_rgcount / gfs_num_journals(sdp);
spin_lock(&sdp->sd_rg_recent_lock);
for (head = &sdp->sd_rg_recent, tmp = head->next;
tmp != head;
tmp = tmp->next) {
rgd = list_entry(tmp, struct gfs_rgrpd, rd_recent);
if (rgd == new_rgd)
goto out;
if (++count >= max)
goto out;
}
new_rgd->rd_try_counter = 0;
list_add_tail(&new_rgd->rd_recent, &sdp->sd_rg_recent);
out:
spin_unlock(&sdp->sd_rg_recent_lock);
}
/**
* forward_rgrp_get - get an rgrp to try next from full list
* @sdp: The GFS superblock
*
* Returns: The rgrp to try next
*/
static struct gfs_rgrpd *
forward_rgrp_get(struct gfs_sbd *sdp)
{
struct gfs_rgrpd *rgd;
unsigned int journals = gfs_num_journals(sdp);
unsigned int rg = 0, x;
spin_lock(&sdp->sd_rg_forward_lock);
rgd = sdp->sd_rg_forward;
if (!rgd) {
if (sdp->sd_rgcount >= journals)
rg = sdp->sd_rgcount *
sdp->sd_lockstruct.ls_jid /
journals;
for (x = 0, rgd = gfs_rgrpd_get_first(sdp);
x < rg;
x++, rgd = gfs_rgrpd_get_next(rgd))
/* Do Nothing */;
sdp->sd_rg_forward = rgd;
}
spin_unlock(&sdp->sd_rg_forward_lock);
return rgd;
}
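/*
 * Worked example (illustrative, not part of this patch): with
 * sd_rgcount = 100 rgrps and 4 journals, the first call on the node holding
 * journal id ls_jid = 2 starts the forward scan at rg = 100 * 2 / 4 = 50,
 * i.e. each node begins in its own quarter of the rgrp list to reduce
 * contention; later calls resume from the saved sd_rg_forward pointer.
 */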
/**
* forward_rgrp_set - set the forward rgrp pointer
* @sdp: the filesystem
* @rgd: The new forward rgrp
*
*/
static void
forward_rgrp_set(struct gfs_sbd *sdp, struct gfs_rgrpd *rgd)
{
spin_lock(&sdp->sd_rg_forward_lock);
sdp->sd_rg_forward = rgd;
spin_unlock(&sdp->sd_rg_forward_lock);
}
/**
* get_local_rgrp - Choose and lock a rgrp for allocation
* @ip: the inode to reserve space for
* (the chosen and locked rgrp is returned via @ip->i_alloc->al_rgd)
*
* Try to acquire an rgrp in a way which avoids contending with others.
*
* Returns: errno
*/
static int
get_local_rgrp(struct gfs_inode *ip)
{
struct gfs_sbd *sdp = ip->i_sbd;
struct gfs_rgrpd *rgd, *begin = NULL;
struct gfs_alloc *al = ip->i_alloc;
int flags = LM_FLAG_TRY;
int skipped = 0;
int loops = 0;
int error;
int try_flag;
unsigned int try_threshold = gfs_tune_get(sdp, gt_rgrp_try_threshold);
/* Try recently successful rgrps */
rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
while (rgd) {
try_flag = (rgd->rd_try_counter >= try_threshold) ?
0: LM_FLAG_TRY;
error = gfs_glock_nq_init(rgd->rd_gl,
LM_ST_EXCLUSIVE, try_flag,
&al->al_rgd_gh);
switch (error) {
case 0:
if (try_rgrp_fit(rgd, al)) {
rgd->rd_try_counter = 0;
goto out;
}
gfs_glock_dq_uninit(&al->al_rgd_gh);
rgd = recent_rgrp_next(rgd, TRUE);
break;
case GLR_TRYFAILED:
rgd->rd_try_counter++;
rgd = recent_rgrp_next(rgd, FALSE);
break;
default:
return error;
}
}
/* Go through full list of rgrps */
begin = rgd = forward_rgrp_get(sdp);
for (;;) {
error = gfs_glock_nq_init(rgd->rd_gl,
LM_ST_EXCLUSIVE, flags,
&al->al_rgd_gh);
switch (error) {
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
gfs_glock_dq_uninit(&al->al_rgd_gh);
break;
case GLR_TRYFAILED:
skipped++;
break;
default:
return error;
}
rgd = gfs_rgrpd_get_next(rgd);
if (!rgd)
rgd = gfs_rgrpd_get_first(sdp);
if (rgd == begin) {
if (++loops >= 2 || !skipped)
return -ENOSPC;
flags = 0;
}
}
out:
ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
if (begin) {
recent_rgrp_add(rgd);
rgd = gfs_rgrpd_get_next(rgd);
if (!rgd)
rgd = gfs_rgrpd_get_first(sdp);
forward_rgrp_set(sdp, rgd);
}
return 0;
}
/**
* gfs_inplace_reserve_i - Reserve space in the filesystem
* @ip: the inode to reserve space for
*
* Acquire resource group locks to allow for the maximum allocation
* described by "res".
*
* This should probably become more complex again, but for now, let's go
* for simple (one resource group) reservations.
*
* Returns: errno
*/
int
gfs_inplace_reserve_i(struct gfs_inode *ip,
char *file, unsigned int line)
{
struct gfs_sbd *sdp = ip->i_sbd;
struct gfs_alloc *al = ip->i_alloc;
int error;
if (gfs_assert_warn(sdp,
al->al_requested_di ||
al->al_requested_data ||
al->al_requested_meta))
return -EINVAL;
error = gfs_rindex_hold(sdp, &al->al_ri_gh);
if (error)
return error;
error = get_local_rgrp(ip);
if (error) {
gfs_glock_dq_uninit(&al->al_ri_gh);
return error;
}
gfs_depend_sync(al->al_rgd);
al->al_file = file;
al->al_line = line;
return 0;
}
/**
* gfs_inplace_release - release an inplace reservation
* @ip: the inode the reservation was taken out on
*
* Release a reservation made by gfs_inplace_reserve().
*/
void
gfs_inplace_release(struct gfs_inode *ip)
{
struct gfs_sbd *sdp = ip->i_sbd;
struct gfs_alloc *al = ip->i_alloc;
if (gfs_assert_warn(sdp, al->al_alloced_di <= al->al_requested_di) == -1)
printk("GFS: fsid=%s: al_alloced_di = %u, al_requested_di = %u\n"
"GFS: fsid=%s: al_file = %s, al_line = %u\n",
sdp->sd_fsname, al->al_alloced_di, al->al_requested_di,
sdp->sd_fsname, al->al_file, al->al_line);
if (gfs_assert_warn(sdp, al->al_alloced_meta <= al->al_reserved_meta) == -1)
printk("GFS: fsid=%s: al_alloced_meta = %u, al_reserved_meta = %u\n"
"GFS: fsid=%s: al_file = %s, al_line = %u\n",
sdp->sd_fsname, al->al_alloced_meta, al->al_reserved_meta,
sdp->sd_fsname, al->al_file, al->al_line);
if (gfs_assert_warn(sdp, al->al_alloced_data <= al->al_reserved_data) == -1)
printk("GFS: fsid=%s: al_alloced_data = %u, al_reserved_data = %u\n"
"GFS: fsid=%s: al_file = %s, al_line = %u\n",
sdp->sd_fsname, al->al_alloced_data, al->al_reserved_data,
sdp->sd_fsname, al->al_file, al->al_line);
al->al_rgd = NULL;
gfs_glock_dq_uninit(&al->al_rgd_gh);
gfs_glock_dq_uninit(&al->al_ri_gh);
}
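/*
 * Hypothetical caller sketch (not part of this patch; real callers live
 * elsewhere in the tree): the usual pattern is gfs_alloc_get(), fill in
 * the al_requested_* counts, gfs_inplace_reserve(), do the allocation
 * inside a transaction, then gfs_inplace_release() and gfs_alloc_put().
 * Error handling and the transaction itself are omitted here.
 */
static int example_reserve_one_data_block(struct gfs_inode *ip)
{
	struct gfs_alloc *al = gfs_alloc_get(ip);
	int error;

	al->al_requested_data = 1;		/* one data block wanted */

	error = gfs_inplace_reserve(ip);	/* choose and EX-lock an rgrp */
	if (!error) {
		/* ... gfs_trans_begin() + gfs_blkalloc(ip, &block) +
		   gfs_trans_end() would go here ... */
		gfs_inplace_release(ip);	/* drop rgrp and rindex glocks */
	}

	gfs_alloc_put(ip);
	return error;
}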
/**
* gfs_get_block_type - Return the allocation state of a block in a RG
* @rgd: the resource group holding the block
* @block: the block number
*
* Returns: The block type (GFS_BLKST_*)
*/
unsigned char
gfs_get_block_type(struct gfs_rgrpd *rgd, uint64_t block)
{
struct gfs_bitmap *bits = NULL;
uint32_t length, rgrp_block, buf_block;
unsigned int buf;
unsigned char type;
length = rgd->rd_ri.ri_length;
rgrp_block = block - rgd->rd_ri.ri_data1;
for (buf = 0; buf < length; buf++) {
bits = &rgd->rd_bits[buf];
if (rgrp_block < (bits->bi_start + bits->bi_len) * GFS_NBBY)
break;
}
gfs_assert(rgd->rd_sbd, buf < length,);
buf_block = rgrp_block - bits->bi_start * GFS_NBBY;
type = gfs_testbit(rgd,
rgd->rd_bh[buf]->b_data + bits->bi_offset,
bits->bi_len, buf_block);
return type;
}
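/*
 * Bitmap geometry, illustrated with made-up numbers: each byte of a
 * bitmap buffer describes GFS_NBBY blocks (two state bits per block,
 * one of the four GFS_BLKST_* values).  A gfs_bitmap with bi_start = 100
 * and bi_len = 50 therefore covers rgrp-relative blocks
 * [100 * GFS_NBBY, 150 * GFS_NBBY), and a block in that window is looked
 * up at offset (rgrp_block - 100 * GFS_NBBY) within the buffer.
 */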
/**
* blkalloc_internal - find a block in @old_state, change allocation
* state to @new_state
* @rgd: the resource group descriptor
* @goal: the goal block within the RG (start here to search for avail block)
* @old_state: GFS_BLKST_XXX the before-allocation state to find
* @new_state: GFS_BLKST_XXX the after-allocation block state
*
* Walk rgrp's bitmap to find bits that represent a block in @old_state.
* Add the found bitmap buffer to the transaction.
* Set the found bits to @new_state to change block's allocation state.
*
* This function never fails, because we wouldn't call it unless we
* know (from reservation results, etc.) that a block is available.
*
* Scope of @goal and returned block is just within rgrp (32-bit),
* not the whole filesystem (64-bit).
*
* Returns: the block # allocated (32-bit rgrp scope)
*/
static uint32_t
blkalloc_internal(struct gfs_rgrpd *rgd,
uint32_t goal,
unsigned char old_state, unsigned char new_state)
{
struct gfs_bitmap *bits = NULL;
uint32_t length = rgd->rd_ri.ri_length;
uint32_t blk = 0;
unsigned int buf, x;
/* Find bitmap block that contains bits for goal block */
for (buf = 0; buf < length; buf++) {
bits = &rgd->rd_bits[buf];
if (goal < (bits->bi_start + bits->bi_len) * GFS_NBBY)
break;
}
gfs_assert(rgd->rd_sbd, buf < length,);
/* Convert scope of "goal" from rgrp-wide to within found bit block */
goal -= bits->bi_start * GFS_NBBY;
/* Search (up to entire) bitmap in this rgrp for allocatable block.
"x <= length", instead of "x < length", because we typically start
the search in the middle of a bit block, but if we can't find an
allocatable block anywhere else, we want to be able to wrap around and
search in the first part of our first-searched bit block. */
for (x = 0; x <= length; x++) {
blk = gfs_bitfit(rgd->rd_bh[buf]->b_data + bits->bi_offset,
bits->bi_len, goal, old_state);
if (blk != BFITNOENT)
break;
/* Try next bitmap block (wrap back to rgrp header if at end) */
buf = (buf + 1) % length;
bits = &rgd->rd_bits[buf];
goal = 0;
}
if (unlikely(x > length)) {
printk("GFS error: possible RG corruption\n");
printk(" please run gfs_fsck after withdraw\n");
dump_stack();
if (gfs_assert_withdraw(rgd->rd_sbd, x <= length))
blk = 0;
}
/* Attach bitmap buffer to trans, modify bits to do block alloc */
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[buf]);
gfs_setbit(rgd,
rgd->rd_bh[buf]->b_data + bits->bi_offset,
bits->bi_len, blk, new_state);
/* Return allocated block #, rgrp scope (32-bit) */
return bits->bi_start * GFS_NBBY + blk;
}
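/*
 * Worked example of the wrap-around (hypothetical numbers): with
 * ri_length = 3 bitmap buffers and a goal that lands in buffer 1, the
 * search order is buffer 1 (from the goal bit onward), buffer 2,
 * buffer 0, then buffer 1 again from bit 0; four passes in total, which
 * is why the loop above runs "x <= length" rather than "x < length".
 */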
/**
* blkfree_internal - Change alloc state of given block(s)
* @sdp: the filesystem
* @bstart: first block (64-bit filesystem scope) of a run of contiguous blocks
* @blen: the length of the block run (all must lie within ONE RG!)
* @new_state: GFS_BLKST_XXX the after-allocation block state
*
* Returns: Resource group containing the block(s)
*
* Find rgrp containing @bstart.
* For each block in run:
* Find allocation bitmap buffer.
* Add bitmap buffer to transaction.
* Set bits to new state.
* Typically used to free blocks to GFS_BLKST_FREE or GFS_BLKST_FREEMETA,
* but @new_state can be any GFS_BLKST_XXX
*
*/
static struct gfs_rgrpd *
blkfree_internal(struct gfs_sbd *sdp, uint64_t bstart, uint32_t blen,
unsigned char new_state)
{
struct gfs_rgrpd *rgd;
struct gfs_bitmap *bits = NULL;
uint32_t length, rgrp_blk, buf_blk;
unsigned int buf;
/* Find rgrp */
rgd = gfs_blk2rgrpd(sdp, bstart);
if (!rgd) {
if (gfs_consist(sdp))
printk("GFS: fsid=%s: block = %llu\n",
sdp->sd_fsname, bstart);
return NULL;
}
length = rgd->rd_ri.ri_length;
/* Convert blk # from filesystem scope (64-bit) to RG scope (32-bit) */
rgrp_blk = bstart - rgd->rd_ri.ri_data1;
while (blen--) {
/* Find bitmap buffer for this block */
for (buf = 0; buf < length; buf++) {
bits = &rgd->rd_bits[buf];
if (rgrp_blk < (bits->bi_start + bits->bi_len) * GFS_NBBY)
break;
}
gfs_assert(rgd->rd_sbd, buf < length,);
/* Find bits and set 'em */
buf_blk = rgrp_blk - bits->bi_start * GFS_NBBY;
rgrp_blk++;
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[buf]);
gfs_setbit(rgd,
rgd->rd_bh[buf]->b_data + bits->bi_offset,
bits->bi_len, buf_blk, new_state);
}
+ if (new_state == GFS_BLKST_FREEMETA)
+ rgd->rd_flags |= RD_FL_META2FREE;
+
return rgd;
}
/**
* clump_alloc - Allocate a clump of metadata blocks
* @rgd: the resource group in which to allocate
* @first: returns the first block allocated
*
* Returns: errno
*
* Bitmap-allocate a clump of metadata blocks
* Write metadata blocks to disk with dummy meta-headers
* Add meta-headers to incore meta-header cache
*/
static int
clump_alloc(struct gfs_rgrpd *rgd, uint32_t *first)
{
struct gfs_sbd *sdp = rgd->rd_sbd;
struct gfs_meta_header mh;
struct buffer_head **bh;
uint32_t goal, blk;
unsigned int x;
int error = 0;
/* Dummy meta-header template */
memset(&mh, 0, sizeof(struct gfs_meta_header));
mh.mh_magic = GFS_MAGIC;
mh.mh_type = GFS_METATYPE_NONE;
/* Array of bh pointers used in several steps */
bh = gmalloc(GFS_META_CLUMP * sizeof(struct buffer_head *));
memset(bh, 0, GFS_META_CLUMP * sizeof(struct buffer_head *));
/* Since we're looking for data blocks to change into meta blocks,
use last alloc'd *data* (not meta) block as start point */
goal = rgd->rd_last_alloc_data;
for (x = 0; x < GFS_META_CLUMP; x++) {
blk = blkalloc_internal(rgd, goal, GFS_BLKST_FREE,
GFS_BLKST_FREEMETA);
if (!x)
*first = blk;
bh[x] = gfs_dgetblk(rgd->rd_gl, rgd->rd_ri.ri_data1 + blk);
gfs_prep_new_buffer(bh[x]);
gfs_meta_header_out(&mh, bh[x]->b_data);
((struct gfs_meta_header *)bh[x]->b_data)->mh_generation = 0;
/* start write of new meta-buffer to disk */
error = gfs_dwrite(sdp, bh[x], DIO_DIRTY | DIO_START);
if (error)
goto out;
goal = blk;
}
/* Block alloc start point for next time */
rgd->rd_last_alloc_data = goal;
/* Wait for all new meta-buffers to get on-disk */
for (x = 0; x < GFS_META_CLUMP; x++) {
error = gfs_dwrite(sdp, bh[x], DIO_WAIT);
if (error)
goto out;
}
/* Add all new meta-headers to meta-header cache */
gfs_mhc_add(rgd, bh, GFS_META_CLUMP);
gfs_assert_withdraw(sdp, rgd->rd_rg.rg_free >= GFS_META_CLUMP);
rgd->rd_rg.rg_free -= GFS_META_CLUMP;
rgd->rd_rg.rg_freemeta += GFS_META_CLUMP;
out:
for (x = 0; x < GFS_META_CLUMP; x++)
if (bh[x]) {
gfs_dwrite(sdp, bh[x], DIO_WAIT);
brelse(bh[x]);
}
kfree(bh);
return error;
}
/**
* gfs_blkalloc - Allocate a data block
* @ip: the inode to allocate the data block for
* @block: the block allocated
*
*/
void
gfs_blkalloc(struct gfs_inode *ip, uint64_t *block)
{
struct gfs_sbd *sdp = ip->i_sbd;
struct gfs_alloc *al = ip->i_alloc;
struct gfs_rgrpd *rgd = al->al_rgd;
uint32_t goal, blk;
int same;
same = (rgd->rd_ri.ri_addr == ip->i_di.di_goal_rgrp);
goal = (same) ? ip->i_di.di_goal_dblk : rgd->rd_last_alloc_data;
blk = blkalloc_internal(rgd, goal,
GFS_BLKST_FREE, GFS_BLKST_USED);
rgd->rd_last_alloc_data = blk;
if (!same) {
ip->i_di.di_goal_rgrp = rgd->rd_ri.ri_addr;
ip->i_di.di_goal_mblk = 0;
}
ip->i_di.di_goal_dblk = blk;
*block = rgd->rd_ri.ri_data1 + blk;
gfs_assert_withdraw(sdp, rgd->rd_rg.rg_free);
rgd->rd_rg.rg_free--;
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
al->al_alloced_data++;
gfs_trans_add_quota(sdp, +1, ip->i_di.di_uid, ip->i_di.di_gid);
/* total=0, free=-1, dinodes=0 */
gfs_statfs_modify(sdp, 0, -1, 0);
}
/**
* gfs_metaalloc - Allocate a metadata block to a file
* @ip: the file
* @block: the block allocated
*
* Returns: errno
*/
int
gfs_metaalloc(struct gfs_inode *ip, uint64_t *block)
{
struct gfs_sbd *sdp = ip->i_sbd;
struct gfs_alloc *al = ip->i_alloc;
struct gfs_rgrpd *rgd = al->al_rgd;
uint32_t goal, blk;
int same;
int error;
same = (rgd->rd_ri.ri_addr == ip->i_di.di_goal_rgrp);
if (!rgd->rd_rg.rg_freemeta) {
error = clump_alloc(rgd, &goal);
if (error)
return error;
al->al_alloced_data += GFS_META_CLUMP;
} else
goal = (same) ? ip->i_di.di_goal_mblk : rgd->rd_last_alloc_meta;
blk = blkalloc_internal(rgd, goal,
GFS_BLKST_FREEMETA, GFS_BLKST_USEDMETA);
rgd->rd_last_alloc_meta = blk;
if (!same) {
ip->i_di.di_goal_rgrp = rgd->rd_ri.ri_addr;
ip->i_di.di_goal_dblk = 0;
}
ip->i_di.di_goal_mblk = blk;
*block = rgd->rd_ri.ri_data1 + blk;
gfs_assert_withdraw(sdp, rgd->rd_rg.rg_freemeta);
rgd->rd_rg.rg_freemeta--;
rgd->rd_rg.rg_usedmeta++;
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
al->al_alloced_meta++;
gfs_trans_add_quota(sdp, +1, ip->i_di.di_uid, ip->i_di.di_gid);
/* total=0, free=-1, dinode=0 */
gfs_statfs_modify(sdp, 0, -1, 0);
return 0;
}
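/*
 * Accounting note, illustrated: when the rgrp has no free metadata,
 * clump_alloc() first converts GFS_META_CLUMP free data blocks to
 * FREEMETA, so one gfs_metaalloc() call may move the rgrp counters by
 * rg_free -= GFS_META_CLUMP, rg_freemeta += GFS_META_CLUMP - 1 and
 * rg_usedmeta += 1, while al_alloced_data is charged for the whole clump
 * and al_alloced_meta only for the block handed back to the caller.
 */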
/**
* gfs_dialloc - Allocate a dinode
* @dip: the directory that the inode is going in
* @block: the block (result) which this function allocates as the dinode
* (64-bit filesystem scope)
*
* Returns: errno
*/
int
gfs_dialloc(struct gfs_inode *dip, uint64_t *block)
{
struct gfs_sbd *sdp = dip->i_sbd;
struct gfs_alloc *al = dip->i_alloc;
struct gfs_rgrpd *rgd = al->al_rgd;
uint32_t goal, blk;
int error = 0;
if (rgd->rd_rg.rg_freemeta)
/* pick up where we left off last time */
goal = rgd->rd_last_alloc_meta;
else {
/* no free meta blocks, allocate a bunch more */
error = clump_alloc(rgd, &goal);
if (error)
return error;
al->al_alloced_data += GFS_META_CLUMP;
}
/* Alloc the dinode; 32-bit "blk" is block offset within rgrp */
blk = blkalloc_internal(rgd, goal,
GFS_BLKST_FREEMETA, GFS_BLKST_USEDMETA);
/* remember where we left off, for next time */
rgd->rd_last_alloc_meta = blk;
/* convert from rgrp scope (32-bit) to filesystem scope (64-bit) */
*block = rgd->rd_ri.ri_data1 + blk;
gfs_assert_withdraw(rgd->rd_sbd, rgd->rd_rg.rg_freemeta);
rgd->rd_rg.rg_freemeta--;
rgd->rd_rg.rg_useddi++;
/* Attach rgrp header to trans, update freemeta and useddi stats */
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
/* Update stats in in-place reservation struct */
al->al_alloced_di++;
al->al_alloced_meta++;
/* total=0, free=-1, dinodes=1 */
gfs_statfs_modify(sdp, 0, -1, +1);
return error;
}
/**
* gfs_blkfree - free a contiguous run of data block(s)
* @ip: the inode these blocks are being freed from
* @bstart: first block (64-bit filesystem scope) of a run of contiguous blocks
* @blen: the length of the block run (all must lie within ONE RG!)
*
* Bitmap-deallocate the blocks (to FREE data state), add bitmap blks to trans
* Update rgrp alloc statistics in rgrp header, add rgrp header buf to trans
* Update quotas, add to trans.
*/
void
gfs_blkfree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen)
{
struct gfs_sbd *sdp = ip->i_sbd;
struct gfs_rgrpd *rgd;
rgd = blkfree_internal(sdp, bstart, blen, GFS_BLKST_FREE);
if (!rgd)
return;
rgd->rd_rg.rg_free += blen;
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
gfs_trans_add_quota(sdp, -(int64_t)blen,
ip->i_di.di_uid,
ip->i_di.di_gid);
/* total=0, free=+blen, dinodes=0 */
gfs_statfs_modify(sdp, 0, blen, 0);
}
/**
* gfs_metafree - free a contiguous run of metadata block(s)
* @ip: the inode these blocks are being freed from
* @bstart: first block (64-bit filesystem scope) of a run of contiguous blocks
* @blen: the length of the block run (all must lie within ONE RG!)
*
* Bitmap-deallocate the blocks (to FREEMETA state), add bitmap blks to trans.
* Update rgrp alloc statistics in rgrp header, add rgrp header to trans.
* Update quotas (quotas include metadata, not just data block usage),
* add to trans.
* Release deallocated buffers, add to meta-header cache (we save these in-core
* so we don't need to re-read meta blocks if/when they are re-alloc'd).
*/
void
gfs_metafree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen)
{
struct gfs_sbd *sdp = ip->i_sbd;
struct gfs_rgrpd *rgd;
rgd = blkfree_internal(sdp, bstart, blen, GFS_BLKST_FREEMETA);
if (!rgd)
return;
if (rgd->rd_rg.rg_usedmeta < blen)
gfs_consist_rgrpd(rgd);
rgd->rd_rg.rg_usedmeta -= blen;
rgd->rd_rg.rg_freemeta += blen;
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
/* total=0, free=blen, dinode=0 */
gfs_statfs_modify(sdp, 0, blen, 0);
gfs_trans_add_quota(sdp, -(int64_t)blen,
ip->i_di.di_uid,
ip->i_di.di_gid);
gfs_wipe_buffers(ip, rgd, bstart, blen);
}
/**
* gfs_difree_uninit - free a dinode block
* @rgd: the resource group that contains the dinode
* @addr: the dinode address
*
* De-allocate the dinode to FREEMETA using block alloc bitmap.
* Update rgrp's block usage statistics (used dinode--, free meta++).
* Add rgrp header to transaction.
*/
void
gfs_difree_uninit(struct gfs_rgrpd *rgd, uint64_t addr)
{
struct gfs_rgrpd *tmp_rgd;
tmp_rgd = blkfree_internal(rgd->rd_sbd, addr, 1,
GFS_BLKST_FREEMETA);
if (!tmp_rgd)
return;
gfs_assert_withdraw(rgd->rd_sbd, rgd == tmp_rgd);
if (!rgd->rd_rg.rg_useddi)
gfs_consist_rgrpd(rgd);
rgd->rd_rg.rg_useddi--;
rgd->rd_rg.rg_freemeta++;
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
gfs_rgrp_out(&rgd->rd_rg, rgd->rd_bh[0]->b_data);
/* total=0, free=1, dinodes=-1 */
gfs_statfs_modify(rgd->rd_sbd, 0, +1, -1);
}
/**
* gfs_difree - free a dinode block
* @rgd: the resource group that contains the dinode
* @ip: the inode representing the dinode to free
*
* Free the dinode block to FREEMETA, update rgrp's block usage stats.
* Update quotas (quotas include metadata, not just data block usage),
* add to trans.
* Release deallocated buffers, add to meta-header cache (we save these in-core
* so we don't need to re-read meta blocks if/when they are re-alloc'd).
*/
void
gfs_difree(struct gfs_rgrpd *rgd, struct gfs_inode *ip)
{
gfs_difree_uninit(rgd, ip->i_num.no_addr);
gfs_trans_add_quota(ip->i_sbd, -1, ip->i_di.di_uid, ip->i_di.di_gid);
gfs_wipe_buffers(ip, rgd, ip->i_num.no_addr, 1);
}
/**
* gfs_rlist_add - add a RG to a list of RGs
* @sdp: the filesystem
* @rlist: the list of resource groups
* @block: the block
*
* Figure out what RG a block belongs to and add that RG to the list
*
* FIXME: Don't use gmalloc()
*
*/
void
gfs_rlist_add(struct gfs_sbd *sdp, struct gfs_rgrp_list *rlist, uint64_t block)
{
struct gfs_rgrpd *rgd;
struct gfs_rgrpd **tmp;
unsigned int new_space;
unsigned int x;
if (gfs_assert_warn(sdp, !rlist->rl_ghs))
return;
rgd = gfs_blk2rgrpd(sdp, block);
if (!rgd) {
if (gfs_consist(sdp))
printk("GFS: fsid=%s: block = %llu\n",
sdp->sd_fsname, block);
return;
}
for (x = 0; x < rlist->rl_rgrps; x++)
if (rlist->rl_rgd[x] == rgd)
return;
if (rlist->rl_rgrps == rlist->rl_space) {
new_space = rlist->rl_space + 10;
tmp = gmalloc(new_space * sizeof(struct gfs_rgrpd *));
if (rlist->rl_rgd) {
memcpy(tmp, rlist->rl_rgd,
rlist->rl_space * sizeof(struct gfs_rgrpd *));
kfree(rlist->rl_rgd);
}
rlist->rl_space = new_space;
rlist->rl_rgd = tmp;
}
rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
}
/**
* gfs_rlist_alloc - all RGs have been added to the rlist, now allocate
* and initialize an array of glock holders for them
* @rlist: the list of resource groups
* @state: the lock state to acquire the RG lock in
* @flags: the modifier flags for the holder structures
*
* FIXME: Don't use gmalloc()
*
*/
void
gfs_rlist_alloc(struct gfs_rgrp_list *rlist, unsigned int state, int flags)
{
unsigned int x;
rlist->rl_ghs = gmalloc(rlist->rl_rgrps * sizeof(struct gfs_holder));
for (x = 0; x < rlist->rl_rgrps; x++)
gfs_holder_init(rlist->rl_rgd[x]->rd_gl,
state, flags,
&rlist->rl_ghs[x]);
}
/**
* gfs_rlist_free - free a resource group list
* @list: the list of resource groups
*
*/
void
gfs_rlist_free(struct gfs_rgrp_list *rlist)
{
unsigned int x;
if (rlist->rl_rgd)
kfree(rlist->rl_rgd);
if (rlist->rl_ghs) {
for (x = 0; x < rlist->rl_rgrps; x++)
gfs_holder_uninit(&rlist->rl_ghs[x]);
kfree(rlist->rl_ghs);
}
}
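/*
 * Hypothetical usage sketch (the real callers, e.g. the dealloc paths,
 * are outside this hunk): collect the rgrps spanned by a set of blocks,
 * lock them all, do the work, then release everything.  gfs_glock_nq()
 * and gfs_glock_dq() are assumed from glock.h; error checks are omitted.
 */
static void example_lock_rgrps_for_blocks(struct gfs_sbd *sdp,
					  uint64_t *blocks, unsigned int num)
{
	struct gfs_rgrp_list rlist;
	unsigned int x;

	memset(&rlist, 0, sizeof(struct gfs_rgrp_list));

	for (x = 0; x < num; x++)
		gfs_rlist_add(sdp, &rlist, blocks[x]);	/* dedupes rgrps */

	gfs_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);	/* init glock holders */

	for (x = 0; x < rlist.rl_rgrps; x++)
		gfs_glock_nq(&rlist.rl_ghs[x]);		/* acquire each rgrp */

	/* ... operate on the blocks here ... */

	for (x = 0; x < rlist.rl_rgrps; x++)
		gfs_glock_dq(&rlist.rl_ghs[x]);

	gfs_rlist_free(&rlist);				/* uninit + kfree */
}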
/**
* gfs_reclaim_metadata - reclaims unused metadata
* @sdp: the file system
* @inodes: returns the number of unused dinodes returned to the free pool
* @metadata: returns the number of unused metadata blocks returned to the free pool
* @rg_max: maximum number of resource groups to reclaim from in one call
*
* This function will look through the resource groups flagged as having
* freeable metadata (RD_FL_META2FREE), up to @rg_max of them per call,
* and free the unused metadata.
*
* Returns: errno
*/
int
gfs_reclaim_metadata(struct gfs_sbd *sdp,
uint64_t *inodes,
- uint64_t *metadata)
+ uint64_t *metadata,
+ uint32_t rg_max)
{
struct gfs_holder ji_gh, ri_gh, rgd_gh, t_gh;
struct gfs_rgrpd *rgd;
struct gfs_rgrp *rg;
struct gfs_dinode *di;
struct gfs_inum next;
struct buffer_head *bh;
uint32_t flags;
uint32_t goal;
unsigned int x;
int error = 0;
+ uint64_t rg_count = 0;
+ rg_max = rg_max > sdp->sd_rgcount ? sdp->sd_rgcount : rg_max;
*inodes = *metadata = 0;
/* Acquire the jindex lock here so we don't deadlock with a
process writing to the jindex inode. :-( */
-
error = gfs_jindex_hold(sdp, &ji_gh);
if (error)
goto fail;
error = gfs_rindex_hold(sdp, &ri_gh);
if (error)
goto fail_jindex_relse;
for (rgd = gfs_rgrpd_get_first(sdp);
- rgd;
+ rgd && rg_count < rg_max;
rgd = gfs_rgrpd_get_next(rgd)) {
+ if (!(rgd->rd_flags & RD_FL_META2FREE))
+ continue;
+
error = gfs_glock_nq_init(rgd->rd_gl,
LM_ST_EXCLUSIVE, GL_NOCACHE,
&rgd_gh);
if (error)
goto fail_rindex_relse;
rgrp_verify(rgd);
rg = &rgd->rd_rg;
if (!rg->rg_freedi && !rg->rg_freemeta) {
gfs_glock_dq_uninit(&rgd_gh);
+ rgd->rd_flags &= ~RD_FL_META2FREE;
continue;
}
gfs_mhc_zap(rgd);
gfs_depend_sync(rgd);
error = gfs_lock_fs_check_clean(sdp, LM_ST_EXCLUSIVE, &t_gh);
if (error)
goto fail_gunlock_rg;
error = gfs_trans_begin(sdp, rgd->rd_ri.ri_length, 0);
if (error)
goto fail_unlock_fs;
next = rg->rg_freedi_list;
for (x = rg->rg_freedi; x--;) {
if (!next.no_formal_ino || !next.no_addr) {
gfs_consist_rgrpd(rgd);
error = -EIO;
goto fail_end_trans;
}
blkfree_internal(sdp, next.no_addr, 1, GFS_BLKST_FREE);
error = gfs_dread(rgd->rd_gl, next.no_addr,
DIO_FORCE | DIO_START | DIO_WAIT, &bh);
if (error)
goto fail_end_trans;
di = (struct gfs_dinode *)bh->b_data;
flags = di->di_flags;
flags = gfs32_to_cpu(flags);
if (!(flags & GFS_DIF_UNUSED)) {
gfs_consist_rgrpd(rgd);
brelse(bh);
error = -EIO;
goto fail_end_trans;
}
gfs_inum_in(&next, (char *)&di->di_next_unused);
brelse(bh);
rg->rg_freedi--;
rg->rg_free++;
(*inodes)++;
}
if (next.no_formal_ino || next.no_addr) {
gfs_consist_rgrpd(rgd);
error = -EIO;
goto fail_end_trans;
}
rg->rg_freedi_list = next;
goal = 0;
for (x = rg->rg_freemeta; x--;) {
goal = blkalloc_internal(rgd, goal,
GFS_BLKST_FREEMETA, GFS_BLKST_FREE);
rg->rg_freemeta--;
rg->rg_free++;
(*metadata)++;
}
gfs_trans_add_bh(rgd->rd_gl, rgd->rd_bh[0]);
gfs_rgrp_out(rg, rgd->rd_bh[0]->b_data);
gfs_trans_end(sdp);
gfs_glock_dq_uninit(&t_gh);
+ rgd->rd_flags &= ~RD_FL_META2FREE;
+ rg_count++;
+
gfs_glock_dq_uninit(&rgd_gh);
}
gfs_glock_dq_uninit(&ri_gh);
gfs_glock_dq_uninit(&ji_gh);
return 0;
fail_end_trans:
gfs_trans_end(sdp);
fail_unlock_fs:
gfs_glock_dq_uninit(&t_gh);
fail_gunlock_rg:
gfs_glock_dq_uninit(&rgd_gh);
fail_rindex_relse:
gfs_glock_dq_uninit(&ri_gh);
fail_jindex_relse:
gfs_glock_dq_uninit(&ji_gh);
fail:
return error;
}
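/*
 * Hypothetical caller sketch for the new rg_max argument (the real caller
 * is outside this hunk): bound a single reclaim pass by the new
 * gt_max_rgrp_free_mdata tunable added to gfs_tune_init() below, so one
 * pass only scrubs a handful of rgrps instead of the whole filesystem.
 */
static int example_reclaim_pass(struct gfs_sbd *sdp)
{
	uint64_t inodes, metadata;
	uint32_t rg_max = gfs_tune_get(sdp, gt_max_rgrp_free_mdata);

	return gfs_reclaim_metadata(sdp, &inodes, &metadata, rg_max);
}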
diff --git a/gfs-kernel/src/gfs/rgrp.h b/gfs-kernel/src/gfs/rgrp.h
index 0f5c62079..c10a9e01e 100644
--- a/gfs-kernel/src/gfs/rgrp.h
+++ b/gfs-kernel/src/gfs/rgrp.h
@@ -1,75 +1,76 @@
#ifndef __RGRP_DOT_H__
#define __RGRP_DOT_H__
void gfs_mhc_add(struct gfs_rgrpd *rgd, struct buffer_head **bh,
unsigned int num);
int gfs_mhc_fish(struct gfs_sbd *sdp, struct buffer_head *bh);
void gfs_mhc_zap(struct gfs_rgrpd *rgd);
void gfs_depend_add(struct gfs_rgrpd *rgd, uint64_t formal_ino);
void gfs_depend_sync(struct gfs_rgrpd *rgd);
struct gfs_rgrpd *gfs_blk2rgrpd(struct gfs_sbd *sdp, uint64_t blk);
struct gfs_rgrpd *gfs_rgrpd_get_first(struct gfs_sbd *sdp);
struct gfs_rgrpd *gfs_rgrpd_get_next(struct gfs_rgrpd *rgd);
void gfs_clear_rgrpd(struct gfs_sbd *sdp);
int gfs_rindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ri_gh);
int gfs_rgrp_read(struct gfs_rgrpd *rgd);
void gfs_rgrp_relse(struct gfs_rgrpd *rgd);
void gfs_rgrp_lvb_fill(struct gfs_rgrpd *rgd);
int gfs_rgrp_lvb_init(struct gfs_rgrpd *rgd);
struct gfs_alloc *gfs_alloc_get(struct gfs_inode *ip);
void gfs_alloc_put(struct gfs_inode *ip);
int gfs_inplace_reserve_i(struct gfs_inode *ip,
char *file, unsigned int line);
#define gfs_inplace_reserve(ip) \
gfs_inplace_reserve_i((ip), __FILE__, __LINE__)
void gfs_inplace_release(struct gfs_inode *ip);
unsigned char gfs_get_block_type(struct gfs_rgrpd *rgd, uint64_t block);
void gfs_blkalloc(struct gfs_inode *ip, uint64_t *block);
int gfs_metaalloc(struct gfs_inode *ip, uint64_t *block);
int gfs_dialloc(struct gfs_inode *dip, uint64_t *block);
void gfs_blkfree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen);
void gfs_metafree(struct gfs_inode *ip, uint64_t bstart, uint32_t blen);
void gfs_difree_uninit(struct gfs_rgrpd *rgd, uint64_t addr);
void gfs_difree(struct gfs_rgrpd *rgd, struct gfs_inode *ip);
extern void gfs_statfs_modify(struct gfs_sbd *sdp,
int64_t total,
int64_t free,
int64_t dinodes);
/*
* gfs_rgrp_list
*
* Used to collect a list of all resource groups spanned by a given
* inode/file/directory
*/
struct gfs_rgrp_list {
unsigned int rl_rgrps; /* # (qty) of rgrps in list (array) */
unsigned int rl_space; /* Current capacity in list for rgrps */
struct gfs_rgrpd **rl_rgd; /* Array of ptrs to rgrp descriptors */
struct gfs_holder *rl_ghs; /* Array of glock holders for rgrps */
};
void gfs_rlist_add(struct gfs_sbd *sdp, struct gfs_rgrp_list *rlist,
uint64_t block);
void gfs_rlist_alloc(struct gfs_rgrp_list *rlist, unsigned int state,
int flags);
void gfs_rlist_free(struct gfs_rgrp_list *rlist);
int gfs_reclaim_metadata(struct gfs_sbd *sdp,
uint64_t *inodes,
- uint64_t *metadata);
+ uint64_t *metadata,
+ uint32_t rg_max);
#endif /* __RGRP_DOT_H__ */
diff --git a/gfs-kernel/src/gfs/super.c b/gfs-kernel/src/gfs/super.c
index 781350e0b..86f2cfad2 100644
--- a/gfs-kernel/src/gfs/super.c
+++ b/gfs-kernel/src/gfs/super.c
@@ -1,1274 +1,1275 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/semaphore.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/vmalloc.h>
#include <linux/statfs.h>
#include "gfs.h"
#include "dio.h"
#include "file.h"
#include "format.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
#include "log.h"
#include "quota.h"
#include "recovery.h"
#include "rgrp.h"
#include "super.h"
#include "unlinked.h"
#include "trans.h"
/**
* gfs_tune_init - Fill a gfs_tune structure with default values
* @gt: tune
*
*/
void
gfs_tune_init(struct gfs_tune *gt)
{
spin_lock_init(&gt->gt_spin);
gt->gt_ilimit1 = 100;
gt->gt_ilimit1_tries = 3;
gt->gt_ilimit1_min = 1;
gt->gt_ilimit2 = 500;
gt->gt_ilimit2_tries = 10;
gt->gt_ilimit2_min = 3;
gt->gt_demote_secs = 300;
gt->gt_incore_log_blocks = 1024;
gt->gt_jindex_refresh_secs = 60;
gt->gt_depend_secs = 60;
gt->gt_scand_secs = 5;
gt->gt_recoverd_secs = 60;
gt->gt_logd_secs = 1;
gt->gt_quotad_secs = 5;
gt->gt_inoded_secs = 15;
gt->gt_glock_purge = 0;
gt->gt_quota_simul_sync = 64;
gt->gt_quota_warn_period = 10;
gt->gt_atime_quantum = 3600;
gt->gt_quota_quantum = 60;
gt->gt_quota_scale_num = 1;
gt->gt_quota_scale_den = 1;
gt->gt_quota_enforce = 1;
gt->gt_quota_account = 1;
gt->gt_new_files_jdata = 0;
gt->gt_new_files_directio = 0;
gt->gt_max_atomic_write = 4 << 20;
gt->gt_max_readahead = 1 << 18;
gt->gt_lockdump_size = 131072;
gt->gt_stall_secs = 600;
gt->gt_complain_secs = 10;
gt->gt_reclaim_limit = 5000;
gt->gt_entries_per_readdir = 32;
gt->gt_prefetch_secs = 10;
gt->gt_statfs_slots = 64;
gt->gt_max_mhc = 10000;
gt->gt_greedy_default = HZ / 10;
gt->gt_greedy_quantum = HZ / 40;
gt->gt_greedy_max = HZ / 4;
gt->gt_rgrp_try_threshold = 100;
gt->gt_statfs_fast = 0;
+ gt->gt_max_rgrp_free_mdata = 5;
}
/**
* gfs_check_sb - Check superblock
* @sdp: the filesystem
* @sb: The superblock
* @silent: Don't print a message if the check fails
*
* Checks the version code of the FS is one that we understand how to
* read and that the sizes of the various on-disk structures have not
* changed.
*/
int
gfs_check_sb(struct gfs_sbd *sdp, struct gfs_sb *sb, int silent)
{
unsigned int x;
if (sb->sb_header.mh_magic != GFS_MAGIC ||
sb->sb_header.mh_type != GFS_METATYPE_SB) {
if (!silent)
printk("GFS: not a GFS filesystem\n");
return -EINVAL;
}
/* If format numbers match exactly, we're done. */
if (sb->sb_fs_format == GFS_FORMAT_FS &&
sb->sb_multihost_format == GFS_FORMAT_MULTI)
return 0;
if (sb->sb_fs_format != GFS_FORMAT_FS) {
for (x = 0; gfs_old_fs_formats[x]; x++)
if (gfs_old_fs_formats[x] == sb->sb_fs_format)
break;
if (!gfs_old_fs_formats[x]) {
printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
GFS_FORMAT_FS, GFS_FORMAT_MULTI,
sb->sb_fs_format, sb->sb_multihost_format);
printk("GFS: I don't know how to upgrade this FS\n");
return -EINVAL;
}
}
if (sb->sb_multihost_format != GFS_FORMAT_MULTI) {
for (x = 0; gfs_old_multihost_formats[x]; x++)
if (gfs_old_multihost_formats[x] == sb->sb_multihost_format)
break;
if (!gfs_old_multihost_formats[x]) {
printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
GFS_FORMAT_FS, GFS_FORMAT_MULTI,
sb->sb_fs_format, sb->sb_multihost_format);
printk("GFS: I don't know how to upgrade this FS\n");
return -EINVAL;
}
}
if (!sdp->sd_args.ar_upgrade) {
printk("GFS: code version (%u, %u) is incompatible with ondisk format (%u, %u)\n",
GFS_FORMAT_FS, GFS_FORMAT_MULTI,
sb->sb_fs_format, sb->sb_multihost_format);
printk("GFS: Use the \"upgrade\" mount option to upgrade the FS\n");
printk("GFS: See the manual for more details\n");
return -EINVAL;
}
return 0;
}
/**
* gfs_read_sb - Read super block
* @sdp: The GFS superblock
* @gl: the glock for the superblock (assumed to be held)
* @silent: Don't print message if mount fails
*
*/
int
gfs_read_sb(struct gfs_sbd *sdp, struct gfs_glock *gl, int silent)
{
struct buffer_head *bh;
uint32_t hash_blocks, ind_blocks, leaf_blocks;
uint32_t tmp_blocks;
unsigned int x;
int error;
error = gfs_dread(gl, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift,
DIO_FORCE | DIO_START | DIO_WAIT, &bh);
if (error) {
if (!silent)
printk("GFS: fsid=%s: can't read superblock\n",
sdp->sd_fsname);
return error;
}
gfs_assert(sdp, sizeof(struct gfs_sb) <= bh->b_size,);
gfs_sb_in(&sdp->sd_sb, bh->b_data);
brelse(bh);
error = gfs_check_sb(sdp, &sdp->sd_sb, silent);
if (error)
return error;
sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
GFS_BASIC_BLOCK_SHIFT;
sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode)) /
sizeof(uint64_t);
sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs_indirect)) /
sizeof(uint64_t);
sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs_meta_header);
sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
/* Compute maximum reservation required to add an entry to a directory */
hash_blocks = DIV_RU(sizeof(uint64_t) * (1 << GFS_DIR_MAX_DEPTH),
sdp->sd_jbsize);
ind_blocks = 0;
for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
tmp_blocks = DIV_RU(tmp_blocks, sdp->sd_inptrs);
ind_blocks += tmp_blocks;
}
leaf_blocks = 2 + GFS_DIR_MAX_DEPTH;
sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode);
sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
for (x = 2;; x++) {
uint64_t space, d;
uint32_t m;
space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
d = space;
m = do_div(d, sdp->sd_inptrs);
if (d != sdp->sd_heightsize[x - 1] || m)
break;
sdp->sd_heightsize[x] = space;
}
sdp->sd_max_height = x;
gfs_assert(sdp, sdp->sd_max_height <= GFS_MAX_META_HEIGHT,);
sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - sizeof(struct gfs_dinode);
sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
for (x = 2;; x++) {
uint64_t space, d;
uint32_t m;
space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
d = space;
m = do_div(d, sdp->sd_inptrs);
if (d != sdp->sd_jheightsize[x - 1] || m)
break;
sdp->sd_jheightsize[x] = space;
}
sdp->sd_max_jheight = x;
gfs_assert(sdp, sdp->sd_max_jheight <= GFS_MAX_META_HEIGHT,);
return 0;
}
/**
* gfs_do_upgrade - upgrade a filesystem
* @sdp: The GFS superblock
*
*/
int
gfs_do_upgrade(struct gfs_sbd *sdp, struct gfs_glock *sb_gl)
{
struct gfs_holder ji_gh, t_gh, j_gh;
struct gfs_log_header lh;
struct buffer_head *bh;
unsigned int x;
int error;
/* If format numbers match exactly, we're done. */
if (sdp->sd_sb.sb_fs_format == GFS_FORMAT_FS &&
sdp->sd_sb.sb_multihost_format == GFS_FORMAT_MULTI) {
printk("GFS: fsid=%s: no upgrade necessary\n",
sdp->sd_fsname);
sdp->sd_args.ar_upgrade = FALSE;
return 0;
}
error = gfs_jindex_hold(sdp, &ji_gh);
if (error)
goto fail;
error = gfs_glock_nq_init(sdp->sd_trans_gl,
LM_ST_EXCLUSIVE, GL_NOCACHE,
&t_gh);
if (error)
goto fail_ji_relse;
if (test_bit(SDF_ROFS, &sdp->sd_flags)) {
printk("GFS: fsid=%s: can't upgrade: read-only FS\n",
sdp->sd_fsname);
error = -EROFS;
goto fail_gunlock_tr;
}
for (x = 0; x < sdp->sd_journals; x++) {
error = gfs_glock_nq_num(sdp,
sdp->sd_jindex[x].ji_addr,
&gfs_meta_glops, LM_ST_SHARED,
LM_FLAG_TRY | GL_NOCACHE, &j_gh);
switch (error) {
case 0:
break;
case GLR_TRYFAILED:
printk("GFS: fsid=%s: journal %u is busy\n",
sdp->sd_fsname, x);
error = -EBUSY;
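/* fall through */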
default:
goto fail_gunlock_tr;
}
error = gfs_find_jhead(sdp, &sdp->sd_jindex[x],
j_gh.gh_gl, &lh);
gfs_glock_dq_uninit(&j_gh);
if (error)
goto fail_gunlock_tr;
if (!(lh.lh_flags & GFS_LOG_HEAD_UNMOUNT) || lh.lh_last_dump) {
printk("GFS: fsid=%s: journal %u is busy\n",
sdp->sd_fsname, x);
error = -EBUSY;
goto fail_gunlock_tr;
}
}
/* We don't need to journal this change because we're changing
only one sector of one block. We definitely don't want to have
the journaling code running at this point. */
error = gfs_dread(sb_gl, GFS_SB_ADDR >> sdp->sd_fsb2bb_shift,
DIO_START | DIO_WAIT, &bh);
if (error)
goto fail_gunlock_tr;
gfs_sb_in(&sdp->sd_sb, bh->b_data);
error = gfs_check_sb(sdp, &sdp->sd_sb, FALSE);
if (error) {
gfs_consist(sdp);
brelse(bh);
goto fail_gunlock_tr;
}
sdp->sd_sb.sb_fs_format = GFS_FORMAT_FS;
sdp->sd_sb.sb_multihost_format = GFS_FORMAT_MULTI;
gfs_sb_out(&sdp->sd_sb, bh->b_data);
set_bit(GLF_DIRTY, &sb_gl->gl_flags);
error = gfs_dwrite(sdp, bh, DIO_DIRTY | DIO_START | DIO_WAIT);
brelse(bh);
gfs_glock_dq_uninit(&t_gh);
gfs_glock_dq_uninit(&ji_gh);
if (!error) {
printk("GFS: fsid=%s: upgrade successful\n",
sdp->sd_fsname);
sdp->sd_args.ar_upgrade = FALSE;
}
return error;
fail_gunlock_tr:
gfs_glock_dq_uninit(&t_gh);
fail_ji_relse:
gfs_glock_dq_uninit(&ji_gh);
fail:
if (error == -EBUSY)
printk("GFS: fsid=%s: can't upgrade: the FS is still busy or contains dirty journals\n",
sdp->sd_fsname);
else
printk("GFS: fsid=%s: can't upgrade: %d\n",
sdp->sd_fsname, error);
return error;
}
/**
* clear_journalsi - Clear all the journal index information (without locking)
* @sdp: The GFS superblock
*
*/
static void
clear_journalsi(struct gfs_sbd *sdp)
{
if (sdp->sd_jindex) {
kfree(sdp->sd_jindex);
sdp->sd_jindex = NULL;
}
sdp->sd_journals = 0;
}
/**
* gfs_clear_journals - Clear all the journal index information
* @sdp: The GFS superblock
*
*/
void
gfs_clear_journals(struct gfs_sbd *sdp)
{
down(&sdp->sd_jindex_lock);
clear_journalsi(sdp);
up(&sdp->sd_jindex_lock);
}
/**
* gfs_ji_update - Update the journal index information
* @ip: The journal index inode
*
* Returns: errno
*/
static int
gfs_ji_update(struct gfs_inode *ip)
{
struct gfs_sbd *sdp = ip->i_sbd;
char buf[sizeof(struct gfs_jindex)];
unsigned int j;
int error;
if (do_mod(ip->i_di.di_size, sizeof(struct gfs_jindex))) {
gfs_consist_inode(ip);
return -EIO;
}
clear_journalsi(sdp);
sdp->sd_jindex = kmalloc(ip->i_di.di_size, GFP_KERNEL);
if (!sdp->sd_jindex)
return -ENOMEM;
memset(sdp->sd_jindex, 0, ip->i_di.di_size);
for (j = 0;; j++) {
error = gfs_internal_read(ip, buf,
j * sizeof(struct gfs_jindex),
sizeof(struct gfs_jindex));
if (!error)
break;
if (error != sizeof(struct gfs_jindex)) {
if (error > 0)
error = -EIO;
goto fail;
}
gfs_jindex_in(sdp->sd_jindex + j, buf);
}
sdp->sd_journals = j;
sdp->sd_jiinode_vn = ip->i_gl->gl_vn;
return 0;
fail:
clear_journalsi(sdp);
return error;
}
/**
* gfs_jindex_hold - Grab a lock on the jindex
* @sdp: The GFS superblock
* @ji_gh: the holder for the jindex glock
*
* This makes sure that we're using the latest copy of the journal index
* special file (this describes all of the journals for this filesystem),
* which might have been updated if someone added journals
* (via gfs_jadd utility).
*
* This is very similar to the gfs_rindex_hold() function, except that
* in general we hold the jindex lock for longer periods of time and
* we grab it far less frequently than the rgrp lock.
*
* Returns: errno
*/
int
gfs_jindex_hold(struct gfs_sbd *sdp, struct gfs_holder *ji_gh)
{
struct gfs_inode *ip = sdp->sd_jiinode;
struct gfs_glock *gl = ip->i_gl;
int error;
error = gfs_glock_nq_init(gl, LM_ST_SHARED, 0, ji_gh);
if (error)
return error;
/* Read new copy from disk if we don't have the latest */
if (sdp->sd_jiinode_vn != gl->gl_vn) {
down(&sdp->sd_jindex_lock);
if (sdp->sd_jiinode_vn != gl->gl_vn)
error = gfs_ji_update(ip);
up(&sdp->sd_jindex_lock);
}
if (error)
gfs_glock_dq_uninit(ji_gh);
return error;
}
/**
* gfs_get_jiinode - Read-in the special (hidden) journal index inode
* @sdp: The GFS superblock
*
* Returns: errno
*
* This reads-in just the dinode, not the special file contents that describe
* the journals themselves (see gfs_jindex_hold()).
*/
int
gfs_get_jiinode(struct gfs_sbd *sdp)
{
struct gfs_holder ji_gh;
int error;
error = gfs_glock_nq_num(sdp,
sdp->sd_sb.sb_jindex_di.no_formal_ino,
&gfs_inode_glops,
LM_ST_SHARED, GL_LOCAL_EXCL,
&ji_gh);
if (error)
return error;
error = gfs_inode_get(ji_gh.gh_gl, &sdp->sd_sb.sb_jindex_di,
CREATE, &sdp->sd_jiinode);
if (!error) {
sdp->sd_jiinode_vn = ji_gh.gh_gl->gl_vn - 1;
set_bit(GLF_STICKY, &ji_gh.gh_gl->gl_flags);
}
gfs_glock_dq_uninit(&ji_gh);
return error;
}
/**
* gfs_get_riinode - Read in the special (hidden) resource group index inode
* @sdp: The GFS superblock
*
* Returns: errno
*
* This reads-in just the dinode, not the special file contents that describe
* the resource groups themselves (see gfs_rindex_hold()).
*/
int
gfs_get_riinode(struct gfs_sbd *sdp)
{
struct gfs_holder ri_gh;
int error;
error = gfs_glock_nq_num(sdp,
sdp->sd_sb.sb_rindex_di.no_formal_ino,
&gfs_inode_glops,
LM_ST_SHARED, GL_LOCAL_EXCL,
&ri_gh);
if (error)
return error;
error = gfs_inode_get(ri_gh.gh_gl, &sdp->sd_sb.sb_rindex_di,
CREATE, &sdp->sd_riinode);
if (!error) {
sdp->sd_riinode_vn = ri_gh.gh_gl->gl_vn - 1;
set_bit(GLF_STICKY, &ri_gh.gh_gl->gl_flags);
}
gfs_glock_dq_uninit(&ri_gh);
return error;
}
/**
* gfs_get_rootinode - Read in the filesystem's root inode
* @sdp: The GFS superblock
*
* Returns: errno
*/
int
gfs_get_rootinode(struct gfs_sbd *sdp)
{
struct gfs_holder i_gh;
int error;
error = gfs_glock_nq_num(sdp,
sdp->sd_sb.sb_root_di.no_formal_ino,
&gfs_inode_glops,
LM_ST_SHARED, GL_LOCAL_EXCL,
&i_gh);
if (error)
return error;
error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_root_di,
CREATE, &sdp->sd_rooti);
gfs_glock_dq_uninit(&i_gh);
return error;
}
/**
* gfs_get_qinode - Read in the special (hidden) quota inode
* @sdp: The GFS superblock
*
* If one is not on-disk already, create a new one.
* Does not read in file contents, just the dinode.
*
* Returns: errno
*/
int
gfs_get_qinode(struct gfs_sbd *sdp)
{
struct gfs_holder i_gh;
int error;
/* Create, if not on-disk already */
if (!sdp->sd_sb.sb_quota_di.no_formal_ino) {
error = gfs_alloc_qinode(sdp);
if (error)
return error;
}
error = gfs_glock_nq_num(sdp,
sdp->sd_sb.sb_quota_di.no_formal_ino,
&gfs_inode_glops,
LM_ST_SHARED, GL_LOCAL_EXCL,
&i_gh);
if (error)
return error;
error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_quota_di,
CREATE, &sdp->sd_qinode);
gfs_glock_dq_uninit(&i_gh);
return error;
}
/**
* gfs_get_linode - Read in the special (hidden) license inode
* @sdp: The GFS superblock
*
* If one is not on-disk already, create a new one.
* Does not read in file contents, just the dinode.
*
* Returns: errno
*/
int
gfs_get_linode(struct gfs_sbd *sdp)
{
struct gfs_holder i_gh;
int error;
/* Create, if not on-disk already */
if (!sdp->sd_sb.sb_license_di.no_formal_ino) {
error = gfs_alloc_linode(sdp);
if (error)
return error;
}
error = gfs_glock_nq_num(sdp,
sdp->sd_sb.sb_license_di.no_formal_ino,
&gfs_inode_glops,
LM_ST_SHARED, GL_LOCAL_EXCL,
&i_gh);
if (error)
return error;
/* iopen obtained via gfs_glock_get(..gfs_iopen_glops) */
error = gfs_inode_get(i_gh.gh_gl, &sdp->sd_sb.sb_license_di,
CREATE, &sdp->sd_linode);
gfs_glock_dq_uninit(&i_gh);
return error;
}
/**
* gfs_make_fs_rw - Turn a Read-Only FS into a Read-Write one
* @sdp: the filesystem
*
* Returns: errno
*/
int
gfs_make_fs_rw(struct gfs_sbd *sdp)
{
struct gfs_glock *j_gl = sdp->sd_journal_gh.gh_gl;
struct gfs_holder t_gh;
struct gfs_log_header head;
int error;
error = gfs_glock_nq_init(sdp->sd_trans_gl,
LM_ST_SHARED,
GL_LOCAL_EXCL | GL_EXACT,
&t_gh);
if (error)
return error;
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
error = gfs_find_jhead(sdp, &sdp->sd_jdesc, j_gl, &head);
if (error)
goto fail;
if (!(head.lh_flags & GFS_LOG_HEAD_UNMOUNT)) {
gfs_consist(sdp);
error = -EIO;
goto fail;
}
/* Initialize some head of the log stuff */
sdp->sd_sequence = head.lh_sequence;
sdp->sd_log_head = head.lh_first + 1;
error = gfs_recover_dump(sdp);
if (error)
goto fail;
set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
clear_bit(SDF_ROFS, &sdp->sd_flags);
set_bit(GLF_DIRTY, &j_gl->gl_flags);
gfs_log_dump(sdp, TRUE);
gfs_glock_dq_uninit(&t_gh);
return 0;
fail:
t_gh.gh_flags |= GL_NOCACHE;
gfs_glock_dq_uninit(&t_gh);
return error;
}
/**
* gfs_make_fs_ro - Turn a Read-Write FS into a Read-Only one
* @sdp: the filesystem
*
* Returns: errno
*/
int
gfs_make_fs_ro(struct gfs_sbd *sdp)
{
struct gfs_holder t_gh;
int error;
error = gfs_glock_nq_init(sdp->sd_trans_gl,
LM_ST_SHARED,
GL_LOCAL_EXCL | GL_EXACT | GL_NOCACHE,
&t_gh);
if (error &&
!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return error;
gfs_statfs_sync(sdp);
gfs_log_flush(sdp);
gfs_quota_sync(sdp);
gfs_quota_scan(sdp);
gfs_sync_meta(sdp);
gfs_log_dump(sdp, TRUE);
gfs_log_shutdown(sdp);
set_bit(SDF_ROFS, &sdp->sd_flags);
clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
if (t_gh.gh_gl)
gfs_glock_dq_uninit(&t_gh);
gfs_unlinked_cleanup(sdp);
gfs_quota_cleanup(sdp);
return error;
}
/**
* stat_gfs_fill - fill in the sg for a given RG
* @rgd: the RG
* @sg: the sg structure
*
* Returns: 0 on success, -ESTALE if the LVB is invalid
*/
static int
stat_gfs_fill(struct gfs_rgrpd *rgd, struct gfs_stat_gfs *sg)
{
struct gfs_rgrp_lvb *rb = (struct gfs_rgrp_lvb *)rgd->rd_gl->gl_lvb;
if (gfs32_to_cpu(rb->rb_magic) != GFS_MAGIC)
return -ESTALE;
sg->sg_total_blocks += rgd->rd_ri.ri_data;
sg->sg_free += gfs32_to_cpu(rb->rb_free);
sg->sg_used_dinode += gfs32_to_cpu(rb->rb_useddi);
sg->sg_free_dinode += gfs32_to_cpu(rb->rb_freedi);
sg->sg_used_meta += gfs32_to_cpu(rb->rb_usedmeta);
sg->sg_free_meta += gfs32_to_cpu(rb->rb_freemeta);
return 0;
}
/**
* stat_gfs_async - Stat a filesystem using asynchronous locking
* @sdp: the filesystem
* @sg: the sg info that will be returned
* @interruptible: TRUE if we should look for signals.
*
* Any error (other than a signal) will cause this routine to fall back
* to the synchronous version.
*
* FIXME: This really shouldn't busy wait like this.
*
* Returns: errno
*/
static int
stat_gfs_async(struct gfs_sbd *sdp, struct gfs_stat_gfs *sg, int interruptible)
{
struct gfs_rgrpd *rgd_next = gfs_rgrpd_get_first(sdp);
struct gfs_holder *gha, *gh;
unsigned int slots = gfs_tune_get(sdp, gt_statfs_slots);
unsigned int x;
int done;
int error = 0, err;
memset(sg, 0, sizeof(struct gfs_stat_gfs));
gha = vmalloc(slots * sizeof(struct gfs_holder));
if (!gha)
return -ENOMEM;
memset(gha, 0, slots * sizeof(struct gfs_holder));
for (;;) {
done = TRUE;
for (x = 0; x < slots; x++) {
gh = gha + x;
if (gh->gh_gl && gfs_glock_poll(gh)) {
err = gfs_glock_wait(gh);
if (err) {
gfs_holder_uninit(gh);
error = err;
} else {
if (!error)
error = stat_gfs_fill(get_gl2rgd(gh->gh_gl), sg);
gfs_glock_dq_uninit(gh);
}
}
if (gh->gh_gl)
done = FALSE;
else if (rgd_next && !error) {
error = gfs_glock_nq_init(rgd_next->rd_gl,
LM_ST_SHARED,
GL_LOCAL_EXCL | GL_SKIP | GL_ASYNC,
gh);
rgd_next = gfs_rgrpd_get_next(rgd_next);
done = FALSE;
}
if (interruptible && signal_pending(current))
error = -ERESTARTSYS;
}
if (done)
break;
yield();
}
vfree(gha);
return error;
}
/**
* stat_gfs_sync - Stat a filesystem using synchronous locking
* @sdp: the filesystem
* @sg: the sg info that will be returned
* @interruptible: TRUE if we should look for signals.
*
* Returns: errno
*/
static int
stat_gfs_sync(struct gfs_sbd *sdp, struct gfs_stat_gfs *sg, int interruptible)
{
struct gfs_holder rgd_gh;
struct gfs_rgrpd *rgd;
int error;
memset(sg, 0, sizeof(struct gfs_stat_gfs));
for (rgd = gfs_rgrpd_get_first(sdp);
rgd;
rgd = gfs_rgrpd_get_next(rgd)) {
for (;;) {
error = gfs_glock_nq_init(rgd->rd_gl,
LM_ST_SHARED,
GL_LOCAL_EXCL | GL_SKIP,
&rgd_gh);
if (error)
return error;
error = stat_gfs_fill(rgd, sg);
gfs_glock_dq_uninit(&rgd_gh);
if (!error)
break;
error = gfs_rgrp_lvb_init(rgd);
if (error)
return error;
}
if (interruptible && signal_pending(current))
return -ERESTARTSYS;
}
return 0;
}
/**
* gfs_stat_gfs - Do a statfs
* @sdp: the filesystem
* @sg: the sg structure
* @interruptible: Stop if there is a signal pending
*
* Returns: errno
*/
int
gfs_stat_gfs(struct gfs_sbd *sdp, struct gfs_stat_gfs *sg, int interruptible)
{
struct gfs_holder ri_gh;
int error;
error = gfs_rindex_hold(sdp, &ri_gh);
if (error)
return error;
error = stat_gfs_async(sdp, sg, interruptible);
if (error == -ESTALE)
error = stat_gfs_sync(sdp, sg, interruptible);
gfs_glock_dq_uninit(&ri_gh);
return error;
}
/**
* gfs_lock_fs_check_clean - Stop all writes to the FS and check that all journals are clean
* @sdp: the file system
* @state: the state to put the transaction lock into
* @t_gh: the hold on the transaction lock
*
* Returns: errno
*/
int
gfs_lock_fs_check_clean(struct gfs_sbd *sdp, unsigned int state,
struct gfs_holder *t_gh)
{
struct gfs_holder ji_gh, cl_gh;
struct gfs_log_header lh;
unsigned int x;
int error;
error = gfs_jindex_hold(sdp, &ji_gh);
if (error)
return error;
error = gfs_glock_nq_num(sdp,
GFS_CRAP_LOCK, &gfs_meta_glops,
LM_ST_SHARED, GL_NOCACHE,
&cl_gh);
if (error)
goto fail;
error = gfs_glock_nq_init(sdp->sd_trans_gl, state,
LM_FLAG_PRIORITY | GL_EXACT | GL_NOCACHE,
t_gh);
if (error)
goto fail_gunlock_craplock;
for (x = 0; x < sdp->sd_journals; x++) {
error = gfs_find_jhead(sdp, &sdp->sd_jindex[x],
cl_gh.gh_gl, &lh);
if (error)
goto fail_gunlock_trans;
if (!(lh.lh_flags & GFS_LOG_HEAD_UNMOUNT)) {
error = -EBUSY;
goto fail_gunlock_trans;
}
}
gfs_glock_dq_uninit(&cl_gh);
gfs_glock_dq_uninit(&ji_gh);
return 0;
fail_gunlock_trans:
gfs_glock_dq_uninit(t_gh);
fail_gunlock_craplock:
gfs_glock_dq_uninit(&cl_gh);
fail:
gfs_glock_dq_uninit(&ji_gh);
return error;
}
/**
* gfs_freeze_fs - freezes the file system
* @sdp: the file system
*
* This function flushes data and meta data for all machines by
* acquiring the transaction log exclusively. All journals are
* ensured to be in a clean state as well.
*
* Returns: errno
*/
int
gfs_freeze_fs(struct gfs_sbd *sdp)
{
int error = 0;
down(&sdp->sd_freeze_lock);
if (!sdp->sd_freeze_count++) {
error = gfs_lock_fs_check_clean(sdp, LM_ST_DEFERRED,
&sdp->sd_freeze_gh);
if (error)
sdp->sd_freeze_count--;
else
sdp->sd_freeze_gh.gh_owner = NULL;
}
up(&sdp->sd_freeze_lock);
return error;
}
/**
* gfs_unfreeze_fs - unfreezes the file system
* @sdp: the file system
*
* This function allows the file system to proceed by unlocking
* the exclusively held transaction lock. Other GFS nodes are
* now free to acquire the lock shared and go on with their lives.
*
*/
void
gfs_unfreeze_fs(struct gfs_sbd *sdp)
{
down(&sdp->sd_freeze_lock);
if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
gfs_glock_dq_uninit(&sdp->sd_freeze_gh);
up(&sdp->sd_freeze_lock);
}
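/*
 * Usage note (illustrative): the freeze is reference counted, so nested
 * callers are safe as long as every successful gfs_freeze_fs() is paired
 * with exactly one gfs_unfreeze_fs(); only the first freeze takes the
 * transaction lock and only the last unfreeze drops it.
 */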
/*
* Fast statfs implementation - mostly based on GFS2 implementation.
*/
void gfs_statfs_change_in(struct gfs_statfs_change_host *sc, const void *buf)
{
const struct gfs_statfs_change *str = buf;
sc->sc_total = be64_to_cpu(str->sc_total);
sc->sc_free = be64_to_cpu(str->sc_free);
sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
}
void gfs_statfs_change_out(const struct gfs_statfs_change_host *sc, void *buf)
{
struct gfs_statfs_change *str = buf;
str->sc_total = cpu_to_be64(sc->sc_total);
str->sc_free = cpu_to_be64(sc->sc_free);
str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
}
int gfs_statfs_start(struct gfs_sbd *sdp)
{
struct gfs_stat_gfs sg;
struct gfs_inode *m_ip;
struct gfs_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct gfs_statfs_change_host *l_sc = &sdp->sd_statfs_local;
struct buffer_head *m_bh;
struct gfs_holder gh;
int error;
printk("GFS: fsid=%s: fast statfs start time = %lu\n",
sdp->sd_fsname, get_seconds());
/* created via gfs_get_linode() in fill_super(). */
/* gfs_inode_glops */
m_ip = sdp->sd_linode;
/* get real statistics */
error = gfs_stat_gfs(sdp, &sg, TRUE);
if (error)
return error;
/* make sure the page is refreshed via glock flushing */
error = gfs_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
&gh);
if (error)
goto gfs_statfs_start_out;
error = gfs_get_inode_buffer(m_ip, &m_bh);
if (error)
goto gfs_statfs_start_unlock;
error = gfs_trans_begin(sdp, 1, 0);
if (error)
goto gfs_statfs_start_bh;
spin_lock(&sdp->sd_statfs_spin);
m_sc->sc_total = sg.sg_total_blocks;
m_sc->sc_free = sg.sg_free + sg.sg_free_dinode + sg.sg_free_meta;
m_sc->sc_dinodes = sg.sg_used_dinode;
memset(l_sc, 0, sizeof(struct gfs_statfs_change_host));
spin_unlock(&sdp->sd_statfs_spin);
gfs_trans_add_bh(m_ip->i_gl, m_bh);
gfs_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs_dinode));
gfs_trans_end(sdp);
gfs_statfs_start_bh:
brelse(m_bh);
gfs_statfs_start_unlock:
gfs_glock_dq_uninit(&gh);
gfs_statfs_start_out:
return 0;
}
int gfs_statfs_init(struct gfs_sbd *sdp, int flag)
{
int error;
/* if flag == 0, do we want to turn this off ? */
if (!flag)
return 0;
error = gfs_statfs_start(sdp);
if (error)
printk("GFS: fsid=%s: can't initialize statfs subsystem: %d\n",
sdp->sd_fsname, error);
return error;
}
void gfs_statfs_modify(struct gfs_sbd *sdp,
int64_t total,
int64_t free,
int64_t dinodes)
{
struct gfs_statfs_change_host *l_sc = &sdp->sd_statfs_local;
spin_lock(&sdp->sd_statfs_spin);
l_sc->sc_total += total;
l_sc->sc_free += free;
l_sc->sc_dinodes += dinodes;
spin_unlock(&sdp->sd_statfs_spin);
}
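/*
 * Illustrative arithmetic with made-up numbers: if the on-disk master copy
 * holds sc_free = 1000 and this node has accumulated a local delta of
 * sc_free = -3 via gfs_statfs_modify(), gfs_statfs_fast() reports
 * 1000 + (-3) = 997 immediately, and the next gfs_statfs_sync() folds the
 * -3 into the master copy and zeroes the local delta.
 */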
int gfs_statfs_sync(struct gfs_sbd *sdp)
{
struct gfs_inode *m_ip = sdp->sd_linode;
struct gfs_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct gfs_statfs_change_host *l_sc = &sdp->sd_statfs_local;
struct gfs_holder gh;
struct buffer_head *m_bh;
int error;
error = gfs_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
&gh);
if (error)
return error;
error = gfs_get_inode_buffer(m_ip, &m_bh);
if (error)
goto gfs_statfs_sync_out;
/* if no change, simply return */
spin_lock(&sdp->sd_statfs_spin);
gfs_statfs_change_in(m_sc, m_bh->b_data +
sizeof(struct gfs_dinode));
if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
spin_unlock(&sdp->sd_statfs_spin);
goto out_bh;
}
spin_unlock(&sdp->sd_statfs_spin);
error = gfs_trans_begin(sdp, 1, 0);
if (error)
goto out_bh;
spin_lock(&sdp->sd_statfs_spin);
m_sc->sc_total += l_sc->sc_total;
m_sc->sc_free += l_sc->sc_free;
m_sc->sc_dinodes += l_sc->sc_dinodes;
memset(l_sc, 0, sizeof(struct gfs_statfs_change_host));
spin_unlock(&sdp->sd_statfs_spin);
gfs_trans_add_bh(m_ip->i_gl, m_bh);
gfs_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs_dinode));
gfs_trans_end(sdp);
out_bh:
brelse(m_bh);
gfs_statfs_sync_out:
gfs_glock_dq_uninit(&gh);
return error;
}
int gfs_statfs_fast(struct gfs_sbd *sdp, void *b)
{
struct kstatfs *buf = (struct kstatfs *)b;
struct gfs_statfs_change_host sc, *m_sc = &sdp->sd_statfs_master;
struct gfs_statfs_change_host *l_sc = &sdp->sd_statfs_local;
spin_lock(&sdp->sd_statfs_spin);
sc.sc_total = m_sc->sc_total + l_sc->sc_total;
sc.sc_free = m_sc->sc_free + l_sc->sc_free;
sc.sc_dinodes = m_sc->sc_dinodes + l_sc->sc_dinodes;
spin_unlock(&sdp->sd_statfs_spin);
if (sc.sc_free < 0)
sc.sc_free = 0;
if (sc.sc_free > sc.sc_total)
sc.sc_free = sc.sc_total;
if (sc.sc_dinodes < 0)
sc.sc_dinodes = 0;
/* fill in the statistics */
memset(buf, 0, sizeof(struct kstatfs));
buf->f_type = GFS_MAGIC;
buf->f_bsize = sdp->sd_sb.sb_bsize;
buf->f_blocks = sc.sc_total;
buf->f_bfree = sc.sc_free;
buf->f_bavail = sc.sc_free;
buf->f_files = sc.sc_dinodes + sc.sc_free;
buf->f_ffree = sc.sc_free;
buf->f_namelen = GFS_FNAMESIZE;
return 0;
}
