summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr.h
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src/afr.h')
-rw-r--r--xlators/cluster/afr/src/afr.h404
1 files changed, 272 insertions, 132 deletions
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 954e9bb31..21064db58 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -36,6 +27,11 @@
#define AFR_XATTR_PREFIX "trusted.afr"
#define AFR_PATHINFO_HEADER "REPLICATE:"
+#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
struct _pump_private;
@@ -66,10 +62,8 @@ typedef enum {
AFR_INODE_SET_READ_CTX = 1,
AFR_INODE_RM_STALE_CHILDREN,
AFR_INODE_SET_OPENDIR_DONE,
- AFR_INODE_SET_SPLIT_BRAIN,
AFR_INODE_GET_READ_CTX,
AFR_INODE_GET_OPENDIR_DONE,
- AFR_INODE_GET_SPLIT_BRAIN,
} afr_inode_op_t;
typedef struct afr_inode_params_ {
@@ -83,29 +77,41 @@ typedef struct afr_inode_params_ {
} u;
} afr_inode_params_t;
+typedef enum afr_spb_state {
+ DONT_KNOW,
+ SPB,
+ NO_SPB
+} afr_spb_state_t;
+
typedef struct afr_inode_ctx_ {
uint64_t masks;
int32_t *fresh_children;//increasing order of latency
+ afr_spb_state_t mdata_spb;
+ afr_spb_state_t data_spb;
+ uint32_t open_fd_count;
} afr_inode_ctx_t;
typedef enum {
NONE,
INDEX,
+ INDEX_TO_BE_HEALED,
FULL,
} afr_crawl_type_t;
typedef struct afr_self_heald_ {
- gf_boolean_t enabled;
- gf_boolean_t iamshd;
- afr_crawl_type_t *pending;
- gf_boolean_t *inprogress;
- afr_child_pos_t *pos;
- time_t *sh_times;
- gf_timer_t **timer;
- eh_t *healed;
- eh_t *heal_failed;
- eh_t *split_brain;
- char *node_uuid;
+ gf_boolean_t enabled;
+ gf_boolean_t iamshd;
+ afr_crawl_type_t *pending;
+ gf_boolean_t *inprogress;
+ afr_child_pos_t *pos;
+ gf_timer_t **timer;
+ eh_t *healed;
+ eh_t *heal_failed;
+ eh_t *split_brain;
+ eh_t **statistics;
+ void **crawl_events;
+ char *node_uuid;
+ int timeout;
} afr_self_heald_t;
typedef struct _afr_private {
@@ -139,6 +145,7 @@ typedef struct _afr_private {
gf_boolean_t entry_change_log; /* on/off */
int read_child; /* read-subvolume */
+ unsigned int hash_mode; /* for when read_child is not set */
int favorite_child; /* subvolume to be preferred in resolving
split-brain cases */
@@ -159,14 +166,48 @@ typedef struct _afr_private {
struct list_head saved_fds; /* list of fds on which locks have succeeded */
gf_boolean_t optimistic_change_log;
gf_boolean_t eager_lock;
+ uint32_t post_op_delay_secs;
unsigned int quorum_count;
char vol_uuid[UUID_SIZE + 1];
int32_t *last_event;
afr_self_heald_t shd;
+ gf_boolean_t choose_local;
+ gf_boolean_t did_discovery;
+ gf_boolean_t readdir_failover;
+ uint64_t sh_readdir_size;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
} afr_private_t;
+typedef enum {
+ AFR_SELF_HEAL_NOT_ATTEMPTED,
+ AFR_SELF_HEAL_STARTED,
+ AFR_SELF_HEAL_FAILED,
+ AFR_SELF_HEAL_SYNC_BEGIN,
+} afr_self_heal_status;
+
typedef struct {
+ afr_self_heal_status gfid_or_missing_entry_self_heal;
+ afr_self_heal_status metadata_self_heal;
+ afr_self_heal_status data_self_heal;
+ afr_self_heal_status entry_self_heal;
+} afr_sh_status_for_all_type;
+
+typedef enum {
+ AFR_SELF_HEAL_ENTRY,
+ AFR_SELF_HEAL_METADATA,
+ AFR_SELF_HEAL_DATA,
+ AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY,
+ AFR_SELF_HEAL_INVALID = -1,
+} afr_self_heal_type;
+
+typedef enum {
+ AFR_CHECK_ALL,
+ AFR_CHECK_SPECIFIC,
+} afr_sh_fail_check_type;
+
+struct afr_self_heal_ {
/* External interface: These are variables (some optional) that
are set by whoever has triggered self-heal */
@@ -175,6 +216,8 @@ typedef struct {
gf_boolean_t do_entry_self_heal;
gf_boolean_t do_gfid_self_heal;
gf_boolean_t do_missing_entry_self_heal;
+ gf_boolean_t force_confirm_spb; /* Check for split-brains even when
+ self-heal is turned off */
gf_boolean_t forced_merge; /* Is this a self-heal triggered to
forcibly merge the directories? */
@@ -241,9 +284,10 @@ typedef struct {
const char *linkname;
gf_boolean_t entries_skipped;
- int op_failed;
-
+ gf_boolean_t actual_sh_started;
+ gf_boolean_t sync_done;
gf_boolean_t data_lock_held;
+ gf_boolean_t sh_dom_lock_held;
gf_boolean_t eof_reached;
fd_t *healing_fd;
int file_has_holes;
@@ -254,17 +298,22 @@ typedef struct {
uint8_t *checksum;
afr_post_remove_call_t post_remove_call;
- loc_t parent_loc;
+ char *data_sh_info;
+ char *metadata_sh_info;
+ loc_t parent_loc;
call_frame_t *orig_frame;
call_frame_t *old_loop_frame;
gf_boolean_t unwound;
afr_sh_algo_private_t *private;
+ afr_sh_status_for_all_type afr_all_sh_status;
+ afr_self_heal_type sh_type_in_action;
struct afr_sh_algorithm *algo;
afr_lock_cbk_t data_lock_success_handler;
afr_lock_cbk_t data_lock_failure_handler;
+ gf_boolean_t data_lock_block;
int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this);
int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
@@ -272,7 +321,9 @@ typedef struct {
void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this);
call_frame_t *sh_frame;
-} afr_self_heal_t;
+};
+
+typedef struct afr_self_heal_ afr_self_heal_t;
typedef enum {
AFR_DATA_TRANSACTION, /* truncate, write, ... */
@@ -334,11 +385,31 @@ afr_index_for_transaction_type (afr_transaction_type type)
return -1; /* make gcc happy */
}
+typedef struct {
+ loc_t loc;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
+
+} afr_entry_lockee_t;
+
+int
+afr_entry_lockee_cmp (const void *l1, const void *l2);
+
+typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
typedef struct {
loc_t *lk_loc;
- struct gf_flock lk_flock;
+ int lockee_count;
+ afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
@@ -347,23 +418,22 @@ typedef struct {
unsigned char *locked_nodes;
unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
- unsigned char *entry_locked_nodes;
selfheal_lk_type_t selfheal_lk_type;
transaction_lk_type_t transaction_lk_type;
int32_t lock_count;
- int32_t inodelk_lock_count;
int32_t entrylk_lock_count;
uint64_t lock_number;
int32_t lk_call_count;
int32_t lk_expected_count;
+ int32_t lk_attempted_count;
int32_t lock_op_ret;
int32_t lock_op_errno;
afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
} afr_internal_lock_t;
typedef struct _afr_locked_fd {
@@ -371,21 +441,29 @@ typedef struct _afr_locked_fd {
struct list_head list;
} afr_locked_fd_t;
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ int32_t op_errno;
+};
+
typedef struct _afr_local {
int uid;
int gid;
unsigned int call_count;
unsigned int success_count;
unsigned int enoent_count;
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
- unsigned int govinda_gOvinda;
+ unsigned int unhealable;
unsigned int read_child_index;
unsigned char read_child_returned;
unsigned int first_up_child;
- pid_t saved_pid;
+ gf_lkowner_t saved_lk_owner;
int32_t op_ret;
int32_t op_errno;
@@ -396,7 +474,6 @@ typedef struct _afr_local {
loc_t newloc;
fd_t *fd;
- unsigned char *fd_open_on;
glusterfs_fop_t fop;
@@ -418,13 +495,25 @@ typedef struct _afr_local {
dict_t *dict;
int optimistic_change_log;
+ gf_boolean_t delayed_post_op;
- gf_boolean_t fop_paused;
- int (*fop_call_continue) (call_frame_t *frame, xlator_t *this);
- /*
- This struct contains the arguments for the "continuation"
- (scheme-like) of fops
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
+ int allow_sh_for_running_transaction;
+
+
+ /* This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
*/
int op;
@@ -435,6 +524,7 @@ typedef struct _afr_local {
} statfs;
struct {
+ uint32_t parent_entrylk;
uuid_t gfid_req;
inode_t *inode;
struct iatt buf;
@@ -446,7 +536,9 @@ typedef struct _afr_local {
int32_t read_child;
int32_t *sources;
int32_t *success_children;
+ int32_t **pending_matrix;
gf_boolean_t fresh_lookup;
+ gf_boolean_t possible_spb;
} lookup;
struct {
@@ -517,7 +609,9 @@ typedef struct _afr_local {
struct {
struct iatt prebuf;
struct iatt postbuf;
+ } inode_wfop; //common structure for all inode-write-fops
+ struct {
int32_t op_ret;
struct iovec *vector;
@@ -528,34 +622,21 @@ typedef struct _afr_local {
} writev;
struct {
- struct iatt prebuf;
- struct iatt postbuf;
- } fsync;
-
- struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} truncate;
struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} ftruncate;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} setattr;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} fsetattr;
struct {
@@ -583,90 +664,66 @@ typedef struct _afr_local {
/* dir write */
struct {
- fd_t *fd;
- dict_t *params;
- int32_t flags;
- mode_t mode;
inode_t *inode;
struct iatt buf;
struct iatt preparent;
struct iatt postparent;
- struct iatt read_child_buf;
+ struct iatt prenewparent;
+ struct iatt postnewparent;
+ } dir_fop; //common structure for all dir fops
+
+ struct {
+ fd_t *fd;
+ dict_t *params;
+ int32_t flags;
+ mode_t mode;
} create;
struct {
dev_t dev;
mode_t mode;
dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt preparent;
- struct iatt postparent;
- struct iatt read_child_buf;
} mknod;
struct {
int32_t mode;
dict_t *params;
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
} mkdir;
struct {
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
- } unlink;
-
- struct {
- int flags;
- int32_t op_ret;
- int32_t op_errno;
- struct iatt preparent;
- struct iatt postparent;
+ int flags;
} rmdir;
struct {
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preoldparent;
- struct iatt prenewparent;
- struct iatt postoldparent;
- struct iatt postnewparent;
- } rename;
-
- struct {
- inode_t *inode;
- struct iatt buf;
- struct iatt read_child_buf;
- struct iatt preparent;
- struct iatt postparent;
- } link;
-
- struct {
- inode_t *inode;
dict_t *params;
- struct iatt buf;
- struct iatt read_child_buf;
char *linkpath;
- struct iatt preparent;
- struct iatt postparent;
} symlink;
+ struct {
+ int32_t mode;
+ off_t offset;
+ size_t len;
+ } fallocate;
+
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+
struct {
- int32_t flags;
- dir_entry_t *entries;
- int32_t count;
- } setdents;
+ off_t offset;
+ size_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
+
+
} cont;
struct {
off_t start, len;
+ gf_boolean_t eager_lock_on;
int *eager_lock;
char *basename;
@@ -677,12 +734,18 @@ typedef struct _afr_local {
afr_transaction_type type;
- int success_count;
- int erase_pending;
- int failure_count;
+ /* pre-compute the post piggyback status before
+ entering POST-OP phase
+ */
+ int *postop_piggybacked;
+
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
+
+ struct list_head eager_locked;
- int last_tried;
- int32_t *child_errno;
+ int32_t **txn_changelog;//changelog after pre+post ops
unsigned char *pre_op;
call_frame_t *main_frame;
@@ -708,6 +771,8 @@ typedef struct _afr_local {
mode_t umask;
int xflag;
+ gf_boolean_t do_discovery;
+ struct afr_reply *replies;
} afr_local_t;
typedef enum {
@@ -717,11 +782,6 @@ typedef enum {
} afr_fd_open_status_t;
typedef struct {
- struct list_head call_list;
- call_frame_t *frame;
-} afr_fd_paused_call_t;
-
-typedef struct {
unsigned int *pre_op_done;
afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
unsigned int *pre_op_piggyback;
@@ -740,7 +800,20 @@ typedef struct {
struct list_head entries; /* needed for readdir failover */
unsigned char *locked_on; /* which subvolumes locks have been successful */
- struct list_head paused_calls; /* queued calls while fix_open happens */
+
+ /* used for delayed-post-op optimization */
+ pthread_mutex_t delay_lock;
+ gf_timer_t *delay_timer;
+ call_frame_t *delay_frame;
+ int call_child;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+
+ /* list of frames currently in progress */
+ struct list_head eager_locked;
} afr_fd_ctx_t;
@@ -776,6 +849,13 @@ int32_t
afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
int
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count);
+
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock);
+
+int
afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
int
@@ -810,8 +890,8 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this);
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count);
int pump_start (call_frame_t *frame, xlator_t *this);
@@ -861,7 +941,8 @@ gf_boolean_t
afr_is_split_brain (xlator_t *this, inode_t *inode);
void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set);
+afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb,
+ afr_spb_state_t data_spb);
int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
@@ -948,12 +1029,13 @@ afr_first_up_child (unsigned char *child_up, size_t child_count);
int
afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count,
int32_t prev_read_child,
- int32_t config_read_child, int32_t *sources);
+ int32_t config_read_child, int32_t *sources,
+ unsigned int hmode, uuid_t gfid);
void
afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
int32_t *fresh_children, int32_t prev_read_child,
- int32_t config_read_child);
+ int32_t config_read_child, uuid_t gfid);
int32_t
afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child,
@@ -975,8 +1057,9 @@ afr_children_rm_child (int32_t *children, int32_t child,
int32_t child_count);
void
afr_reset_children (int32_t *children, int32_t child_count);
-gf_boolean_t
-afr_error_more_important (int32_t old_errno, int32_t new_errno);
+int32_t
+afr_most_important_error(int32_t old_errno, int32_t new_errno,
+ gf_boolean_t eio);
int
afr_errno_count (int32_t *children, int *child_errno,
unsigned int child_count, int32_t op_errno);
@@ -1019,11 +1102,11 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
int (*unwind) (call_frame_t *frame, xlator_t *this,
int32_t op_ret, int32_t op_errno,
int32_t sh_failed));
-int
-afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx,
- int need_open_count, int *need_open);
-int
-afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop);
+void
+afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open);
+
+void
+afr_open_fd_fix (fd_t *fd, xlator_t *this);
int
afr_set_elem_count_get (unsigned char *elems, int child_count);
@@ -1050,6 +1133,23 @@ afr_matrix_cleanup (int32_t **pending, unsigned int m);
int32_t**
afr_matrix_create (unsigned int m, unsigned int n);
+
+gf_boolean_t
+afr_is_errno_set (int *child_errno, int child);
+
+gf_boolean_t
+afr_is_errno_unset (int *child_errno, int child);
+
+gf_boolean_t
+afr_is_fd_fixable (fd_t *fd);
+
+void
+afr_prepare_new_entry_pending_matrix (int32_t **pending,
+ gf_boolean_t (*is_pending) (int *, int),
+ int *ctx, struct iatt *buf,
+ unsigned int child_count);
+void
+afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
/*
* Special value indicating we should use the "auto" quorum method instead of
* a fixed value (including zero to turn off quorum enforcement).
@@ -1069,4 +1169,44 @@ afr_matrix_create (unsigned int m, unsigned int n);
} \
} while (0);
+
+#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO."
+
+#define AFR_SBRAIN_CHECK_FD(fd, label) do { \
+ if (fd->inode && afr_is_split_brain (this, fd->inode)) { \
+ op_errno = EIO; \
+ gf_log (this->name, GF_LOG_WARNING, \
+ AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \
+ goto label; \
+ } \
+} while (0)
+
+#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \
+ if (loc->inode && afr_is_split_brain (this, loc->inode)) { \
+ op_errno = EIO; \
+ loc_path (loc, NULL); \
+ gf_log (this->name, GF_LOG_WARNING, \
+ AFR_SBRAIN_MSG , loc->path); \
+ goto label; \
+ } \
+} while (0)
+
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
+
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
+
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+
+afr_inode_ctx_t*
+afr_inode_ctx_get (inode_t *inode, xlator_t *this);
+
#endif /* __AFR_H__ */