summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr-v1/src/afr.h
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr-v1/src/afr.h')
-rw-r--r--xlators/cluster/afr-v1/src/afr.h1215
1 files changed, 1215 insertions, 0 deletions
diff --git a/xlators/cluster/afr-v1/src/afr.h b/xlators/cluster/afr-v1/src/afr.h
new file mode 100644
index 000000000..9196a1f27
--- /dev/null
+++ b/xlators/cluster/afr-v1/src/afr.h
@@ -0,0 +1,1215 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __AFR_H__
+#define __AFR_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "compat-errno.h"
+#include "afr-mem-types.h"
+#include "afr-self-heal-algorithm.h"
+
+#include "libxlator.h"
+#include "timer.h"
+
+#define AFR_XATTR_PREFIX "trusted.afr"
+#define AFR_PATHINFO_HEADER "REPLICATE:"
+#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+
+#define afr_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE)
+
+struct _pump_private;
+
+typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this,
+ int child, int32_t op_error,
+ int32_t op_errno);
+
+typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this,
+ int32_t op_error, int32_t op_errno);
+typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this);
+
+typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
+typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this,
+ int32_t op_ret, int32_t op_errno);
+
+typedef enum {
+ AFR_POS_UNKNOWN,
+ AFR_POS_LOCAL,
+ AFR_POS_REMOTE
+} afr_child_pos_t;
+
+typedef enum {
+ SPLIT_BRAIN = 1,
+ ALL_FOOLS = 2
+} afr_subvol_status_t;
+
+typedef enum {
+ AFR_INODE_SET_READ_CTX = 1,
+ AFR_INODE_RM_STALE_CHILDREN,
+ AFR_INODE_SET_OPENDIR_DONE,
+ AFR_INODE_GET_READ_CTX,
+ AFR_INODE_GET_OPENDIR_DONE,
+} afr_inode_op_t;
+
+typedef struct afr_inode_params_ {
+ afr_inode_op_t op;
+ union {
+ gf_boolean_t value;
+ struct {
+ int32_t read_child;
+ int32_t *children;
+ } read_ctx;
+ } u;
+} afr_inode_params_t;
+
+typedef enum afr_spb_state {
+ DONT_KNOW,
+ SPB,
+ NO_SPB
+} afr_spb_state_t;
+
+typedef struct afr_inode_ctx_ {
+ uint64_t masks;
+ int32_t *fresh_children;//increasing order of latency
+ afr_spb_state_t mdata_spb;
+ afr_spb_state_t data_spb;
+ uint32_t open_fd_count;
+} afr_inode_ctx_t;
+
+typedef enum {
+ NONE,
+ INDEX,
+ INDEX_TO_BE_HEALED,
+ FULL,
+} afr_crawl_type_t;
+
+typedef struct afr_self_heald_ {
+ gf_boolean_t enabled;
+ gf_boolean_t iamshd;
+ afr_crawl_type_t *pending;
+ gf_boolean_t *inprogress;
+ afr_child_pos_t *pos;
+ gf_timer_t **timer;
+ eh_t *healed;
+ eh_t *heal_failed;
+ eh_t *split_brain;
+ eh_t **statistics;
+ void **crawl_events;
+ char *node_uuid;
+ int timeout;
+} afr_self_heald_t;
+
+typedef struct _afr_private {
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
+
+ unsigned int read_child_rr; /* round-robin index of the read_child */
+ gf_lock_t read_child_lock; /* lock to protect above */
+
+ xlator_t **children;
+
+ int first_lookup;
+ inode_t *root_inode;
+
+ unsigned char *child_up;
+
+ char **pending_key;
+
+ char *data_self_heal; /* on/off/open */
+ char * data_self_heal_algorithm; /* name of algorithm */
+ unsigned int data_self_heal_window_size; /* max number of pipelined
+ read/writes */
+
+ unsigned int background_self_heal_count;
+ unsigned int background_self_heals_started;
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
+
+ gf_boolean_t data_change_log; /* on/off */
+ gf_boolean_t metadata_change_log; /* on/off */
+ gf_boolean_t entry_change_log; /* on/off */
+
+ int read_child; /* read-subvolume */
+ unsigned int hash_mode; /* for when read_child is not set */
+ int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
+
+ gf_boolean_t inodelk_trace;
+ gf_boolean_t entrylk_trace;
+
+ gf_boolean_t strict_readdir;
+
+ unsigned int wait_count; /* # of servers to wait for success */
+
+ uint64_t up_count; /* number of CHILD_UPs we have seen */
+ uint64_t down_count; /* number of CHILD_DOWNs we have seen */
+
+ struct _pump_private *pump_private; /* Set if we are loaded as pump */
+ int use_afr_in_pump;
+
+ pthread_mutex_t mutex;
+ struct list_head saved_fds; /* list of fds on which locks have succeeded */
+ gf_boolean_t optimistic_change_log;
+ gf_boolean_t eager_lock;
+ uint32_t post_op_delay_secs;
+ unsigned int quorum_count;
+
+ char vol_uuid[UUID_SIZE + 1];
+ int32_t *last_event;
+ afr_self_heald_t shd;
+ gf_boolean_t choose_local;
+ gf_boolean_t did_discovery;
+ gf_boolean_t readdir_failover;
+ uint64_t sh_readdir_size;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
+} afr_private_t;
+
+typedef enum {
+ AFR_SELF_HEAL_NOT_ATTEMPTED,
+ AFR_SELF_HEAL_STARTED,
+ AFR_SELF_HEAL_FAILED,
+ AFR_SELF_HEAL_SYNC_BEGIN,
+} afr_self_heal_status;
+
+typedef struct {
+ afr_self_heal_status gfid_or_missing_entry_self_heal;
+ afr_self_heal_status metadata_self_heal;
+ afr_self_heal_status data_self_heal;
+ afr_self_heal_status entry_self_heal;
+} afr_sh_status_for_all_type;
+
+typedef enum {
+ AFR_SELF_HEAL_ENTRY,
+ AFR_SELF_HEAL_METADATA,
+ AFR_SELF_HEAL_DATA,
+ AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY,
+ AFR_SELF_HEAL_INVALID = -1,
+} afr_self_heal_type;
+
+typedef enum {
+ AFR_CHECK_ALL,
+ AFR_CHECK_SPECIFIC,
+} afr_sh_fail_check_type;
+
+struct afr_self_heal_ {
+ /* External interface: These are variables (some optional) that
+ are set by whoever has triggered self-heal */
+
+ gf_boolean_t do_data_self_heal;
+ gf_boolean_t do_metadata_self_heal;
+ gf_boolean_t do_entry_self_heal;
+ gf_boolean_t do_gfid_self_heal;
+ gf_boolean_t do_missing_entry_self_heal;
+ gf_boolean_t force_confirm_spb; /* Check for split-brains even when
+ self-heal is turned off */
+
+ gf_boolean_t forced_merge; /* Is this a self-heal triggered to
+ forcibly merge the directories? */
+
+ gf_boolean_t background; /* do self-heal in background
+ if possible */
+ ia_type_t type; /* st_mode of the entry we're doing
+ self-heal on */
+ inode_t *inode; /* inode on which the self-heal is
+ performed on */
+ uuid_t sh_gfid_req; /* gfid self-heal needs to be done
+ with this gfid if it is not null */
+
+ /* Function to call to unwind. If self-heal is being done in the
+ background, this function will be called as soon as possible. */
+
+ int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, int32_t sh_failed);
+
+ /* End of external interface members */
+
+
+ /* array of stat's, one for each child */
+ struct iatt *buf;
+ struct iatt *parentbufs;
+ struct iatt parentbuf;
+ struct iatt entrybuf;
+
+ afr_expunge_done_cbk_t expunge_done;
+ afr_impunge_done_cbk_t impunge_done;
+
+ /* array of xattr's, one for each child */
+ dict_t **xattr;
+
+ /* array containing if the lookups succeeded in the order of response
+ */
+ int32_t *success_children;
+ int success_count;
+ /* array containing the fresh children found in the self-heal process */
+ int32_t *fresh_children;
+ /* array containing the fresh children found in the parent lookup */
+ int32_t *fresh_parent_dirs;
+ /* array of errno's, one for each child */
+ int *child_errno;
+ /*loc used for lookup*/
+ loc_t lookup_loc;
+ int32_t lookup_flags;
+ afr_lookup_done_cbk_t lookup_done;
+
+ int32_t **pending_matrix;
+ int32_t **delta_matrix;
+
+ int32_t op_ret;
+ int32_t op_errno;
+
+ int *sources;
+ int source;
+ int active_source;
+ int active_sinks;
+ unsigned char *success;
+ unsigned char *locked_nodes;
+ int lock_count;
+
+ const char *linkname;
+ gf_boolean_t entries_skipped;
+
+ gf_boolean_t actual_sh_started;
+ gf_boolean_t sync_done;
+ gf_boolean_t data_lock_held;
+ gf_boolean_t sh_dom_lock_held;
+ gf_boolean_t eof_reached;
+ fd_t *healing_fd;
+ int file_has_holes;
+ blksize_t block_size;
+ off_t file_size;
+ off_t offset;
+ unsigned char *write_needed;
+ uint8_t *checksum;
+ afr_post_remove_call_t post_remove_call;
+
+ char *data_sh_info;
+ char *metadata_sh_info;
+
+ loc_t parent_loc;
+ call_frame_t *orig_frame;
+ call_frame_t *old_loop_frame;
+ gf_boolean_t unwound;
+
+ afr_sh_algo_private_t *private;
+ afr_sh_status_for_all_type afr_all_sh_status;
+ afr_self_heal_type sh_type_in_action;
+
+ struct afr_sh_algorithm *algo;
+ afr_lock_cbk_t data_lock_success_handler;
+ afr_lock_cbk_t data_lock_failure_handler;
+ gf_boolean_t data_lock_block;
+ int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
+ int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this);
+ int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
+ int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
+ void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this);
+
+ call_frame_t *sh_frame;
+};
+
+typedef struct afr_self_heal_ afr_self_heal_t;
+
+typedef enum {
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+} afr_transaction_type;
+
+typedef enum {
+ AFR_TRANSACTION_LK,
+ AFR_SELFHEAL_LK,
+} transaction_lk_type_t;
+
+typedef enum {
+ AFR_LOCK_OP,
+ AFR_UNLOCK_OP,
+} afr_lock_op_type_t;
+
+typedef enum {
+ AFR_DATA_SELF_HEAL_LK,
+ AFR_METADATA_SELF_HEAL_LK,
+ AFR_ENTRY_SELF_HEAL_LK,
+}selfheal_lk_type_t;
+
+typedef enum {
+ AFR_INODELK_TRANSACTION,
+ AFR_INODELK_NB_TRANSACTION,
+ AFR_ENTRYLK_TRANSACTION,
+ AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_INODELK_SELFHEAL,
+ AFR_INODELK_NB_SELFHEAL,
+ AFR_ENTRYLK_SELFHEAL,
+ AFR_ENTRYLK_NB_SELFHEAL,
+} afr_lock_call_type_t;
+
+/*
+ xattr format: trusted.afr.volume = [x y z]
+ x - data pending
+ y - metadata pending
+ z - entry pending
+*/
+
+static inline int
+afr_index_for_transaction_type (afr_transaction_type type)
+{
+ switch (type) {
+
+ case AFR_DATA_TRANSACTION:
+ return 0;
+
+ case AFR_METADATA_TRANSACTION:
+ return 1;
+
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ return 2;
+ }
+
+ return -1; /* make gcc happy */
+}
+
+typedef struct {
+ loc_t loc;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
+
+} afr_entry_lockee_t;
+
+int
+afr_entry_lockee_cmp (const void *l1, const void *l2);
+
+typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
+
+typedef struct {
+ loc_t *lk_loc;
+
+ int lockee_count;
+ afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
+ const char *lk_basename;
+ const char *lower_basename;
+ const char *higher_basename;
+ char lower_locked;
+ char higher_locked;
+
+ unsigned char *locked_nodes;
+ unsigned char *lower_locked_nodes;
+
+ selfheal_lk_type_t selfheal_lk_type;
+ transaction_lk_type_t transaction_lk_type;
+
+ int32_t lock_count;
+ int32_t entrylk_lock_count;
+
+ uint64_t lock_number;
+ int32_t lk_call_count;
+ int32_t lk_expected_count;
+ int32_t lk_attempted_count;
+
+ int32_t lock_op_ret;
+ int32_t lock_op_errno;
+ afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+} afr_internal_lock_t;
+
+typedef struct _afr_locked_fd {
+ fd_t *fd;
+ struct list_head list;
+} afr_locked_fd_t;
+
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ int32_t op_errno;
+};
+
+typedef struct _afr_local {
+ int uid;
+ int gid;
+ unsigned int call_count;
+ unsigned int success_count;
+ unsigned int enoent_count;
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
+
+
+ unsigned int unhealable;
+
+ unsigned int read_child_index;
+ unsigned char read_child_returned;
+ unsigned int first_up_child;
+
+ gf_lkowner_t saved_lk_owner;
+
+ int32_t op_ret;
+ int32_t op_errno;
+
+ int32_t **pending;
+
+ loc_t loc;
+ loc_t newloc;
+
+ fd_t *fd;
+
+ glusterfs_fop_t fop;
+
+ unsigned char *child_up;
+ int32_t *fresh_children; //in the order of response
+
+ int32_t *child_errno;
+
+ dict_t *xattr_req;
+
+ int32_t inodelk_count;
+ int32_t entrylk_count;
+
+ afr_internal_lock_t internal_lock;
+
+ afr_locked_fd_t *locked_fd;
+ int32_t source_child;
+ int32_t lock_recovery_child;
+
+ dict_t *dict;
+ int optimistic_change_log;
+ gf_boolean_t delayed_post_op;
+
+
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
+ int attempt_self_heal;
+ int foreground_self_heal;
+
+
+ /* This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
+
+ int op;
+ struct {
+ struct {
+ unsigned char buf_set;
+ struct statvfs buf;
+ } statfs;
+
+ struct {
+ uint32_t parent_entrylk;
+ uuid_t gfid_req;
+ inode_t *inode;
+ struct iatt buf;
+ struct iatt postparent;
+ dict_t **xattrs;
+ dict_t *xattr;
+ struct iatt *postparents;
+ struct iatt *bufs;
+ int32_t read_child;
+ int32_t *sources;
+ int32_t *success_children;
+ int32_t **pending_matrix;
+ gf_boolean_t fresh_lookup;
+ gf_boolean_t possible_spb;
+ } lookup;
+
+ struct {
+ int32_t flags;
+ } open;
+
+ struct {
+ int32_t cmd;
+ struct gf_flock user_flock;
+ struct gf_flock ret_flock;
+ unsigned char *locked_nodes;
+ } lk;
+
+ /* inode read */
+
+ struct {
+ int32_t mask;
+ int last_index; /* index of the child we tried previously */
+ } access;
+
+ struct {
+ int last_index;
+ } stat;
+
+ struct {
+ int last_index;
+ } fstat;
+
+ struct {
+ size_t size;
+ int last_index;
+ } readlink;
+
+ struct {
+ char *name;
+ int last_index;
+ long xattr_len;
+ } getxattr;
+
+ struct {
+ size_t size;
+ off_t offset;
+ int last_index;
+ uint32_t flags;
+ } readv;
+
+ /* dir read */
+
+ struct {
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+
+ uint32_t *checksum;
+ } opendir;
+
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+ dict_t *dict;
+ gf_boolean_t failed;
+ int last_index;
+ } readdir;
+ /* inode write */
+
+ struct {
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } inode_wfop; //common structure for all inode-write-fops
+
+ struct {
+ int32_t op_ret;
+
+ struct iovec *vector;
+ struct iobref *iobref;
+ int32_t count;
+ off_t offset;
+ uint32_t flags;
+ } writev;
+
+ struct {
+ off_t offset;
+ } truncate;
+
+ struct {
+ off_t offset;
+ } ftruncate;
+
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } setattr;
+
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } fsetattr;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
+
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } fsetxattr;
+
+ struct {
+ char *name;
+ } removexattr;
+
+ struct {
+ dict_t *xattr;
+ } xattrop;
+
+ struct {
+ dict_t *xattr;
+ } fxattrop;
+
+ /* dir write */
+
+ struct {
+ inode_t *inode;
+ struct iatt buf;
+ struct iatt preparent;
+ struct iatt postparent;
+ struct iatt prenewparent;
+ struct iatt postnewparent;
+ } dir_fop; //common structure for all dir fops
+
+ struct {
+ fd_t *fd;
+ dict_t *params;
+ int32_t flags;
+ mode_t mode;
+ } create;
+
+ struct {
+ dev_t dev;
+ mode_t mode;
+ dict_t *params;
+ } mknod;
+
+ struct {
+ int32_t mode;
+ dict_t *params;
+ } mkdir;
+
+ struct {
+ int flags;
+ } rmdir;
+
+ struct {
+ dict_t *params;
+ char *linkpath;
+ } symlink;
+
+ struct {
+ int32_t mode;
+ off_t offset;
+ size_t len;
+ } fallocate;
+
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+
+ struct {
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
+
+
+ } cont;
+
+ struct {
+ off_t start, len;
+
+ gf_boolean_t eager_lock_on;
+ int *eager_lock;
+
+ char *basename;
+ char *new_basename;
+
+ loc_t parent_loc;
+ loc_t new_parent_loc;
+
+ afr_transaction_type type;
+
+ /* pre-compute the post piggyback status before
+ entering POST-OP phase
+ */
+ int *postop_piggybacked;
+
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
+
+ struct list_head eager_locked;
+
+ int32_t **txn_changelog;//changelog after pre+post ops
+ unsigned char *pre_op;
+
+ call_frame_t *main_frame;
+
+ int (*fop) (call_frame_t *frame, xlator_t *this);
+
+ int (*done) (call_frame_t *frame, xlator_t *this);
+
+ int (*resume) (call_frame_t *frame, xlator_t *this);
+
+ int (*unwind) (call_frame_t *frame, xlator_t *this);
+
+ /* post-op hook */
+ } transaction;
+
+ afr_self_heal_t self_heal;
+
+ struct marker_str marker;
+
+ /* extra data for fops */
+ dict_t *xdata_req;
+ dict_t *xdata_rsp;
+
+ mode_t umask;
+ int xflag;
+ gf_boolean_t do_discovery;
+ struct afr_reply *replies;
+} afr_local_t;
+
+typedef enum {
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
+} afr_fd_open_status_t;
+
+typedef struct {
+ unsigned int *pre_op_done;
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+ unsigned int *pre_op_piggyback;
+
+ unsigned int *lock_piggyback;
+ unsigned int *lock_acquired;
+
+ int flags;
+ uint64_t up_count; /* number of CHILD_UPs this fd has seen */
+ uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
+
+ int32_t last_tried;
+
+ int hit, miss;
+ gf_boolean_t failed_over;
+ struct list_head entries; /* needed for readdir failover */
+
+ unsigned char *locked_on; /* which subvolumes locks have been successful */
+
+ /* used for delayed-post-op optimization */
+ pthread_mutex_t delay_lock;
+ gf_timer_t *delay_timer;
+ call_frame_t *delay_frame;
+ int call_child;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+
+ /* list of frames currently in progress */
+ struct list_head eager_locked;
+} afr_fd_ctx_t;
+
+
+/* try alloc and if it fails, goto label */
+#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \
+ var = mem_get0 (THIS->local_pool); \
+ if (!var) { \
+ gf_log (this->name, GF_LOG_ERROR, \
+ "out of memory :("); \
+ op_errno = ENOMEM; \
+ goto label; \
+ } \
+ } while (0);
+
+
+/* did a call fail due to a child failing? */
+#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
+ ((op_errno == ENOTCONN) || \
+ (op_errno == EBADFD)))
+
+#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1)
+
+/* have we tried all children? */
+#define all_tried(i, count) ((i) == (count) - 1)
+
+int32_t
+afr_set_dict_gfid (dict_t *dict, uuid_t gfid);
+
+int
+pump_command_reply (call_frame_t *frame, xlator_t *this);
+
+int32_t
+afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
+
+int
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count);
+
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock);
+
+int
+afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
+
+int
+afr_save_locked_fd (xlator_t *this, fd_t *fd);
+
+int
+afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
+ unsigned char *locked_nodes);
+
+void
+afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner);
+
+int
+afr_set_lock_number (call_frame_t *frame, xlator_t *this);
+
+
+loc_t *
+lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2);
+
+int32_t
+afr_unlock (call_frame_t *frame, xlator_t *this);
+
+int
+afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this);
+
+int
+afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this);
+
+int
+afr_blocking_lock (call_frame_t *frame, xlator_t *this);
+
+int
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
+
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
+ unsigned int child_count);
+
+int pump_start (call_frame_t *frame, xlator_t *this);
+
+int
+__afr_fd_ctx_set (xlator_t *this, fd_t *fd);
+
+int
+afr_fd_ctx_set (xlator_t *this, fd_t *fd);
+
+int32_t
+afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children);
+
+void
+afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child,
+ int32_t *fresh_children);
+
+int
+afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno);
+
+unsigned int
+afr_up_children_count (unsigned char *child_up, unsigned int child_count);
+
+unsigned int
+afr_locked_children_count (unsigned char *children, unsigned int child_count);
+
+unsigned int
+afr_pre_op_done_children_count (unsigned char *pre_op,
+ unsigned int child_count);
+
+gf_boolean_t
+afr_is_fresh_lookup (loc_t *loc, xlator_t *this);
+
+void
+afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent);
+
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+
+void
+afr_local_cleanup (afr_local_t *local, xlator_t *this);
+
+int
+afr_frame_return (call_frame_t *frame);
+
+gf_boolean_t
+afr_is_split_brain (xlator_t *this, inode_t *inode);
+
+void
+afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb,
+ afr_spb_state_t data_spb);
+
+int
+afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
+
+void
+afr_set_opendir_done (xlator_t *this, inode_t *inode);
+
+gf_boolean_t
+afr_is_opendir_done (xlator_t *this, inode_t *inode);
+
+void
+afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
+
+int
+afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
+
+int
+afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd);
+
+#define AFR_STACK_UNWIND(fop, frame, params ...) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ if (frame) { \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ } \
+ STACK_UNWIND_STRICT (fop, frame, params); \
+ if (__local) { \
+ afr_local_cleanup (__local, __this); \
+ mem_put (__local); \
+ } \
+ } while (0)
+
+#define AFR_STACK_DESTROY(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_DESTROY (frame->root); \
+ if (__local) { \
+ afr_local_cleanup (__local, __this); \
+ mem_put (__local); \
+ } \
+ } while (0);
+
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
+/* allocate and return a string that is the basename of argument */
+static inline char *
+AFR_BASENAME (const char *str)
+{
+ char *__tmp_str = NULL;
+ char *__basename_str = NULL;
+ __tmp_str = gf_strdup (str);
+ __basename_str = gf_strdup (basename (__tmp_str));
+ GF_FREE (__tmp_str);
+ return __basename_str;
+}
+
+int
+afr_transaction_local_init (afr_local_t *local, xlator_t *this);
+
+int32_t
+afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
+
+int32_t *
+afr_children_create (int32_t child_count);
+
+int
+afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
+
+int
+afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
+ transaction_lk_type_t lk_type);
+
+int
+afr_first_up_child (unsigned char *child_up, size_t child_count);
+
+int
+afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count,
+ int32_t prev_read_child,
+ int32_t config_read_child, int32_t *sources,
+ unsigned int hmode, uuid_t gfid);
+
+void
+afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
+ int32_t *fresh_children, int32_t prev_read_child,
+ int32_t config_read_child, uuid_t gfid);
+
+int32_t
+afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child,
+ int32_t *fresh_children,
+ int32_t *call_child, int32_t *last_index);
+
+int32_t
+afr_next_call_child (int32_t *fresh_children, unsigned char *child_up,
+ size_t child_count, int32_t *last_index,
+ int32_t read_child);
+void
+afr_get_fresh_children (int32_t *success_children, int32_t *sources,
+ int32_t *children, unsigned int child_count);
+void
+afr_children_add_child (int32_t *children, int32_t child,
+ int32_t child_count);
+void
+afr_children_rm_child (int32_t *children, int32_t child,
+ int32_t child_count);
+void
+afr_reset_children (int32_t *children, int32_t child_count);
+int32_t
+afr_most_important_error(int32_t old_errno, int32_t new_errno,
+ gf_boolean_t eio);
+int
+afr_errno_count (int32_t *children, int *child_errno,
+ unsigned int child_count, int32_t op_errno);
+int
+afr_get_children_count (int32_t *children, unsigned int child_count);
+gf_boolean_t
+afr_is_child_present (int32_t *success_children, int32_t child_count,
+ int32_t child);
+void
+afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs,
+ int32_t *success_children,
+ unsigned int child_count);
+void
+afr_reset_xattr (dict_t **xattr, unsigned int child_count);
+gf_boolean_t
+afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
+ unsigned int child_count, const char *path,
+ const char *xlator_name);
+unsigned int
+afr_gfid_missing_count (const char *xlator_name, int32_t *children,
+ struct iatt *bufs, unsigned int child_count,
+ const char *path);
+void
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path);
+void
+afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count);
+afr_transaction_type
+afr_transaction_type_get (ia_type_t ia_type);
+int32_t
+afr_resultant_errno_get (int32_t *children,
+ int *child_errno, unsigned int child_count);
+void
+afr_inode_rm_stale_children (xlator_t *this, inode_t *inode,
+ int32_t *stale_children);
+void
+afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t background, ia_type_t ia_type, char *reason,
+ void (*gfid_sh_success_cbk) (call_frame_t *sh_frame,
+ xlator_t *this),
+ int (*unwind) (call_frame_t *frame, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ int32_t sh_failed));
+void
+afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open);
+
+void
+afr_open_fd_fix (fd_t *fd, xlator_t *this);
+int
+afr_set_elem_count_get (unsigned char *elems, int child_count);
+
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
+
+gf_boolean_t
+afr_open_only_data_self_heal (char *data_self_heal);
+
+gf_boolean_t
+afr_data_self_heal_enabled (char *data_self_heal);
+
+void
+afr_set_low_priority (call_frame_t *frame);
+int
+afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,
+ int flags);
+
+gf_boolean_t
+afr_have_quorum (char *logname, afr_private_t *priv);
+
+void
+afr_matrix_cleanup (int32_t **pending, unsigned int m);
+
+int32_t**
+afr_matrix_create (unsigned int m, unsigned int n);
+
+gf_boolean_t
+afr_is_errno_set (int *child_errno, int child);
+
+gf_boolean_t
+afr_is_errno_unset (int *child_errno, int child);
+
+gf_boolean_t
+afr_is_fd_fixable (fd_t *fd);
+
+void
+afr_prepare_new_entry_pending_matrix (int32_t **pending,
+ gf_boolean_t (*is_pending) (int *, int),
+ int *ctx, struct iatt *buf,
+ unsigned int child_count);
+void
+afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
+/*
+ * Special value indicating we should use the "auto" quorum method instead of
+ * a fixed value (including zero to turn off quorum enforcement).
+ */
+#define AFR_QUORUM_AUTO INT_MAX
+
+/*
+ * Having this as a macro will make debugging a bit weirder, but does reduce
+ * the probability of functions handling this check inconsistently.
+ */
+#define QUORUM_CHECK(_func,_label) do { \
+ if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \
+ gf_log(this->name,GF_LOG_WARNING, \
+ "failing "#_func" due to lack of quorum"); \
+ op_errno = EROFS; \
+ goto _label; \
+ } \
+} while (0);
+
+
+#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO."
+
+#define AFR_SBRAIN_CHECK_FD(fd, label) do { \
+ if (fd->inode && afr_is_split_brain (this, fd->inode)) { \
+ op_errno = EIO; \
+ gf_log (this->name, GF_LOG_WARNING, \
+ AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \
+ goto label; \
+ } \
+} while (0)
+
+#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \
+ if (loc->inode && afr_is_split_brain (this, loc->inode)) { \
+ op_errno = EIO; \
+ loc_path (loc, NULL); \
+ gf_log (this->name, GF_LOG_WARNING, \
+ AFR_SBRAIN_MSG , loc->path); \
+ goto label; \
+ } \
+} while (0)
+
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
+
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
+
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+
+afr_inode_ctx_t*
+afr_inode_ctx_get (inode_t *inode, xlator_t *this);
+
+#endif /* __AFR_H__ */