From 6d3739292b7b51d2ddbab75b5f884fb38925b943 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Thu, 16 Jan 2014 16:14:36 -0800 Subject: cluster/afr: refactor - Remove client side self-healing completely (opendir, openfd, lookup) - Re-work readdir-failover to work reliably in case of NFS - Remove unused/dead lock recovery code - Consistently use xdata in both calls and callbacks in all FOPs - Per-inode event generation, used to force inode ctx refresh - Implement dirty flag support (in place of pending counts) - Eliminate inode ctx structure, use read subvol bits + event_generation - Implement inode ctx refreshing based on event generation - Provide backward compatibility in transactions - remove unused variables and functions - make code more consistent in style and pattern - regularize and clean up inode-write transaction code - regularize and clean up dir-write transaction code - regularize and clean up common FOPs - reorganize transaction framework code - skip setting xattrs in pending dict if nothing is pending - re-write self-healing code using syncops - re-write simpler self-heal-daemon Change-Id: I1e4080c9796c8a2815c2dab4be3073f389d614a8 BUG: 1021686 Signed-off-by: Anand Avati Reviewed-on: http://review.gluster.org/6010 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/afr/src/afr.h | 823 +++++++++++++++--------------------------- 1 file changed, 291 insertions(+), 532 deletions(-) (limited to 'xlators/cluster/afr/src/afr.h') diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 9196a1f27..2e1b78d1c 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -20,112 +20,42 @@ #include "call-stub.h" #include "compat-errno.h" #include "afr-mem-types.h" -#include "afr-self-heal-algorithm.h" #include "libxlator.h" #include "timer.h" +#include "syncop.h" + +#include "afr-self-heald.h" #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" #define 
AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" #define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) #define AFR_LOCKEE_COUNT_MAX 3 #define AFR_DOM_COUNT_MAX 3 - -#define afr_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE) - -struct _pump_private; - -typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int child, int32_t op_error, - int32_t op_errno); - -typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int32_t op_error, int32_t op_errno); -typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); -typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno); -typedef enum { - AFR_POS_UNKNOWN, - AFR_POS_LOCAL, - AFR_POS_REMOTE -} afr_child_pos_t; +typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); -typedef enum { - SPLIT_BRAIN = 1, - ALL_FOOLS = 2 -} afr_subvol_status_t; +typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err); -typedef enum { - AFR_INODE_SET_READ_CTX = 1, - AFR_INODE_RM_STALE_CHILDREN, - AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_GET_READ_CTX, - AFR_INODE_GET_OPENDIR_DONE, -} afr_inode_op_t; - -typedef struct afr_inode_params_ { - afr_inode_op_t op; - union { - gf_boolean_t value; - struct { - int32_t read_child; - int32_t *children; - } read_ctx; - } u; -} afr_inode_params_t; - -typedef enum afr_spb_state { - DONT_KNOW, - SPB, - NO_SPB -} afr_spb_state_t; - -typedef struct afr_inode_ctx_ { - uint64_t masks; - int32_t *fresh_children;//increasing order of latency - afr_spb_state_t mdata_spb; - afr_spb_state_t data_spb; - uint32_t open_fd_count; -} afr_inode_ctx_t; +typedef int (*afr_changelog_resume_t) 
(call_frame_t *frame, xlator_t *this); -typedef enum { - NONE, - INDEX, - INDEX_TO_BE_HEALED, - FULL, -} afr_crawl_type_t; - -typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t iamshd; - afr_crawl_type_t *pending; - gf_boolean_t *inprogress; - afr_child_pos_t *pos; - gf_timer_t **timer; - eh_t *healed; - eh_t *heal_failed; - eh_t *split_brain; - eh_t **statistics; - void **crawl_events; - char *node_uuid; - int timeout; -} afr_self_heald_t; +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;}) +#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) +#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ - unsigned int read_child_rr; /* round-robin index of the read_child */ - gf_lock_t read_child_lock; /* lock to protect above */ - xlator_t **children; - int first_lookup; inode_t *root_inode; unsigned char *child_up; @@ -146,6 +76,7 @@ typedef struct _afr_private { gf_boolean_t metadata_change_log; /* on/off */ gf_boolean_t entry_change_log; /* on/off */ + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ unsigned int hash_mode; /* for when read_child is not set */ int favorite_child; /* subvolume to be preferred in resolving @@ -154,178 +85,45 @@ typedef struct _afr_private { gf_boolean_t inodelk_trace; gf_boolean_t entrylk_trace; - gf_boolean_t strict_readdir; - unsigned int wait_count; /* # of servers to wait for success */ uint64_t up_count; /* number of CHILD_UPs we have seen */ uint64_t down_count; /* number of CHILD_DOWNs we have seen */ - struct _pump_private *pump_private; /* Set if we are loaded as pump */ - int use_afr_in_pump; - - pthread_mutex_t mutex; - struct list_head 
saved_fds; /* list of fds on which locks have succeeded */ gf_boolean_t optimistic_change_log; gf_boolean_t eager_lock; + gf_boolean_t pre_op_compat; /* on/off */ uint32_t post_op_delay_secs; unsigned int quorum_count; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; - afr_self_heald_t shd; + + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. + */ + uint32_t event_generation; + gf_boolean_t choose_local; gf_boolean_t did_discovery; - gf_boolean_t readdir_failover; uint64_t sh_readdir_size; gf_boolean_t ensure_durability; char *sh_domain; -} afr_private_t; - -typedef enum { - AFR_SELF_HEAL_NOT_ATTEMPTED, - AFR_SELF_HEAL_STARTED, - AFR_SELF_HEAL_FAILED, - AFR_SELF_HEAL_SYNC_BEGIN, -} afr_self_heal_status; - -typedef struct { - afr_self_heal_status gfid_or_missing_entry_self_heal; - afr_self_heal_status metadata_self_heal; - afr_self_heal_status data_self_heal; - afr_self_heal_status entry_self_heal; -} afr_sh_status_for_all_type; - -typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; - -typedef enum { - AFR_CHECK_ALL, - AFR_CHECK_SPECIFIC, -} afr_sh_fail_check_type; - -struct afr_self_heal_ { - /* External interface: These are variables (some optional) that - are set by whoever has triggered self-heal */ - - gf_boolean_t do_data_self_heal; - gf_boolean_t do_metadata_self_heal; - gf_boolean_t do_entry_self_heal; - gf_boolean_t do_gfid_self_heal; - gf_boolean_t 
do_missing_entry_self_heal; - gf_boolean_t force_confirm_spb; /* Check for split-brains even when - self-heal is turned off */ - - gf_boolean_t forced_merge; /* Is this a self-heal triggered to - forcibly merge the directories? */ - - gf_boolean_t background; /* do self-heal in background - if possible */ - ia_type_t type; /* st_mode of the entry we're doing - self-heal on */ - inode_t *inode; /* inode on which the self-heal is - performed on */ - uuid_t sh_gfid_req; /* gfid self-heal needs to be done - with this gfid if it is not null */ - - /* Function to call to unwind. If self-heal is being done in the - background, this function will be called as soon as possible. */ - - int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno, int32_t sh_failed); - - /* End of external interface members */ - - - /* array of stat's, one for each child */ - struct iatt *buf; - struct iatt *parentbufs; - struct iatt parentbuf; - struct iatt entrybuf; - - afr_expunge_done_cbk_t expunge_done; - afr_impunge_done_cbk_t impunge_done; - - /* array of xattr's, one for each child */ - dict_t **xattr; - - /* array containing if the lookups succeeded in the order of response - */ - int32_t *success_children; - int success_count; - /* array containing the fresh children found in the self-heal process */ - int32_t *fresh_children; - /* array containing the fresh children found in the parent lookup */ - int32_t *fresh_parent_dirs; - /* array of errno's, one for each child */ - int *child_errno; - /*loc used for lookup*/ - loc_t lookup_loc; - int32_t lookup_flags; - afr_lookup_done_cbk_t lookup_done; - - int32_t **pending_matrix; - int32_t **delta_matrix; + char *afr_dirty; - int32_t op_ret; - int32_t op_errno; + afr_self_heald_t shd; - int *sources; - int source; - int active_source; - int active_sinks; - unsigned char *success; - unsigned char *locked_nodes; - int lock_count; - - const char *linkname; - gf_boolean_t entries_skipped; - - gf_boolean_t 
actual_sh_started; - gf_boolean_t sync_done; - gf_boolean_t data_lock_held; - gf_boolean_t sh_dom_lock_held; - gf_boolean_t eof_reached; - fd_t *healing_fd; - int file_has_holes; - blksize_t block_size; - off_t file_size; - off_t offset; - unsigned char *write_needed; - uint8_t *checksum; - afr_post_remove_call_t post_remove_call; - - char *data_sh_info; - char *metadata_sh_info; - - loc_t parent_loc; - call_frame_t *orig_frame; - call_frame_t *old_loop_frame; - gf_boolean_t unwound; - - afr_sh_algo_private_t *private; - afr_sh_status_for_all_type afr_all_sh_status; - afr_self_heal_type sh_type_in_action; - - struct afr_sh_algorithm *algo; - afr_lock_cbk_t data_lock_success_handler; - afr_lock_cbk_t data_lock_failure_handler; - gf_boolean_t data_lock_block; - int (*completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); - int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - - call_frame_t *sh_frame; -}; + /* pump dependencies */ + void *pump_private; + gf_boolean_t use_afr_in_pump; +} afr_private_t; -typedef struct afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... 
*/ @@ -438,32 +236,72 @@ typedef struct { char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; -typedef struct _afr_locked_fd { - fd_t *fd; - struct list_head list; -} afr_locked_fd_t; - struct afr_reply { int valid; int32_t op_ret; int32_t op_errno; + dict_t *xdata; + struct iatt poststat; + struct iatt postparent; + struct iatt prestat; + struct iatt preparent; + struct iatt preparent2; + struct iatt postparent2; + uint8_t checksum[MD5_DIGEST_LENGTH]; }; +typedef enum { + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ + + unsigned int *lock_piggyback; + unsigned int *lock_acquired; + + int flags; + + /* used for delayed-post-op optimization */ + pthread_mutex_t delay_lock; + gf_timer_t *delay_timer; + call_frame_t *delay_frame; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + uint32_t open_fd_count; + + + /* list of frames currently in progress */ + struct list_head eager_locked; +} afr_fd_ctx_t; + + typedef struct _afr_local { - int uid; - int gid; + glusterfs_fop_t op; unsigned int call_count; - unsigned int success_count; - unsigned int enoent_count; - uint32_t open_fd_count; - gf_boolean_t update_open_fd_count; + /* @event_generation: copy of priv->event_generation taken at the + time of starting the transaction. The copy is made so that we + have a stable value through the various phases of the transaction. 
+ */ + unsigned int event_generation; - unsigned int unhealable; - - unsigned int read_child_index; - unsigned char read_child_returned; - unsigned int first_up_child; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; gf_lkowner_t saved_lk_owner; @@ -472,78 +310,117 @@ typedef struct _afr_local { int32_t **pending; + int dirty[AFR_NUM_CHANGE_LOGS]; + loc_t loc; loc_t newloc; fd_t *fd; + afr_fd_ctx_t *fd_ctx; - glusterfs_fop_t fop; - + /* @child_up: copy of priv->child_up taken at the time of transaction + start. The copy is taken so that we have a stable child_up array + through the phases of the transaction as priv->child_up[i] can keep + changing through time. + */ unsigned char *child_up; - int32_t *fresh_children; //in the order of response - int32_t *child_errno; + /* @read_attempted: + array of flags representing subvolumes where read operations of + the read transaction have already been attempted. The array is + first pre-filled with down subvolumes, and as reads are performed + on other subvolumes, those are set as well. This way if the read + operation fails we do not retry on that subvolume again. + */ + unsigned char *read_attempted; + + /* @readfn: - dict_t *xattr_req; + pointer to function which will perform the read operation on a given + subvolume. Used in read transactions. + */ - int32_t inodelk_count; - int32_t entrylk_count; + afr_read_txn_wind_t readfn; - afr_internal_lock_t internal_lock; + /* @refreshed: - afr_locked_fd_t *locked_fd; - int32_t source_child; - int32_t lock_recovery_child; + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; + + /* @inode: + + the inode on which the read txn is performed on. ref'ed and copied + from either fd->inode or loc.inode + */ + + inode_t *inode; + + /* @parent[2]: + + parent inode[s] on which directory transactions are performed. 
+ */ + + inode_t *parent; + inode_t *parent2; + + /* @readable: + + array of flags representing servers from which a read can be + performed. This is the output of afr_inode_refresh() + */ + unsigned char *readable; + + afr_inode_refresh_cbk_t refreshfn; + + /* @refreshinode: + + Inode currently getting refreshed. + */ + inode_t *refreshinode; + + /* + @pre_op_compat: + + compatibility mode of pre-op. Send separate pre-op and + op operations as part of the transaction, rather than combining them + */ + + gf_boolean_t pre_op_compat; + + dict_t *xattr_req; + + afr_internal_lock_t internal_lock; dict_t *dict; + int optimistic_change_log; gf_boolean_t delayed_post_op; - /* Is the current writev() going to perform a stable write? i.e, is fd->flags or @flags writev param have O_SYNC or O_DSYNC? */ - gf_boolean_t stable_write; - - /* This write appended to the file. Nnot necessarily O_APPEND, - just means the offset of write was at the end of file. - */ - gf_boolean_t append_write; - - int attempt_self_heal; - int foreground_self_heal; + gf_boolean_t stable_write; + /* This write appended to the file. Not necessarily O_APPEND, + just means the offset of write was at the end of file. 
+ */ + gf_boolean_t append_write; - /* This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops */ - int op; struct { struct { unsigned char buf_set; struct statvfs buf; } statfs; - struct { - uint32_t parent_entrylk; - uuid_t gfid_req; - inode_t *inode; - struct iatt buf; - struct iatt postparent; - dict_t **xattrs; - dict_t *xattr; - struct iatt *postparents; - struct iatt *bufs; - int32_t read_child; - int32_t *sources; - int32_t *success_children; - int32_t **pending_matrix; - gf_boolean_t fresh_lookup; - gf_boolean_t possible_spb; - } lookup; - struct { int32_t flags; } open; @@ -737,22 +614,67 @@ typedef struct _afr_local { afr_transaction_type type; - /* pre-compute the post piggyback status before - entering POST-OP phase - */ - int *postop_piggybacked; - /* stub to resume on destruction of the transaction frame */ call_stub_t *resume_stub; struct list_head eager_locked; - int32_t **txn_changelog;//changelog after pre+post ops unsigned char *pre_op; + /* @fop_subvols: subvolumes on which FOP will be attempted */ + unsigned char *fop_subvols; + + /* @failed_subvols: subvolumes on which FOP failed. Always + a subset of @fop_subvols */ + unsigned char *failed_subvols; + + /* @dirtied: flag which indicates whether we set dirty flag + in the OP. Typically true when we are performing operation + on more than one subvol and optimistic changelog is disabled + + A 'true' value set in @dirtied flag means an 'undirtying' + has to be done in POST-OP phase. + */ + gf_boolean_t dirtied; + + /* @inherited: flag which indicates that the dirty flags + of the previous transaction were inherited + */ + gf_boolean_t inherited; + + /* + @no_uninherit: flag which indicates that a pre_op_uninherit() + must _not_ be attempted (and returned as failure) always. This + flag is set when a hard pre-op is performed, but not accounted + for it in fd_ctx->on_disk[]. 
Such transactions are "isolated" + from the pre-op piggybacking entirely and therefore uninherit + must not be attempted. + */ + gf_boolean_t no_uninherit; + + /* @uninherit_done: + @uninherit_value: + + The above pair of variables makes pre_op_uninherit() idempotent. + Both are FALSE initially. The first call to pre_op_uninherit + sets @uninherit_done to TRUE and the return value to + @uninherit_value. Further calls will check for @uninherit_done + to be TRUE and if so will simply return @uninherit_value. + */ + gf_boolean_t uninherit_done; + gf_boolean_t uninherit_value; + + /* @changelog_resume: function to be called after changelogging + (either pre-op or post-op) is done + */ + + afr_changelog_resume_t changelog_resume; + call_frame_t *main_frame; + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + int (*fop) (call_frame_t *frame, xlator_t *this); int (*done) (call_frame_t *frame, xlator_t *this); @@ -764,7 +686,7 @@ typedef struct _afr_local { /* post-op hook */ } transaction; - afr_self_heal_t self_heal; + syncbarrier_t barrier; struct marker_str marker; @@ -778,75 +700,58 @@ typedef struct _afr_local { struct afr_reply *replies; } afr_local_t; -typedef enum { - AFR_FD_NOT_OPENED, - AFR_FD_OPENED, - AFR_FD_OPENING -} afr_fd_open_status_t; - -typedef struct { - unsigned int *pre_op_done; - afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ - unsigned int *pre_op_piggyback; - - unsigned int *lock_piggyback; - unsigned int *lock_acquired; - - int flags; - uint64_t up_count; /* number of CHILD_UPs this fd has seen */ - uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ - - int32_t last_tried; - - int hit, miss; - gf_boolean_t failed_over; - struct list_head entries; /* needed for readdir failover */ - - unsigned char *locked_on; /* which subvolumes locks have been successful */ - - /* used for delayed-post-op optimization */ - pthread_mutex_t delay_lock; - gf_timer_t *delay_timer; - call_frame_t *delay_frame; - int 
call_child; - - /* set if any write on this fd was a non stable write - (i.e, without O_SYNC or O_DSYNC) - */ - gf_boolean_t witnessed_unstable_write; - - /* list of frames currently in progress */ - struct list_head eager_locked; -} afr_fd_ctx_t; - - -/* try alloc and if it fails, goto label */ -#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \ - var = mem_get0 (THIS->local_pool); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ - } while (0); - /* did a call fail due to a child failing? */ #define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ ((op_errno == ENOTCONN) || \ (op_errno == EBADFD))) -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); -/* have we tried all children? 
*/ -#define all_tried(i, count) ((i) == (count) - 1) +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvol, + int event_generation); +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int event_generation); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid); +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this); int -pump_command_reply (call_frame_t *frame, xlator_t *this); +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable); + +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type); + +#define afr_data_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION) + +#define afr_metadata_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION) + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t cbk); int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2); @@ -861,9 +766,6 @@ afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); -int -afr_save_locked_fd (xlator_t *this, fd_t *fd); - int afr_mark_locked_nodes (xlator_t *this, fd_t *fd, unsigned char *locked_nodes); @@ -874,10 +776,6 @@ afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); int afr_set_lock_number (call_frame_t *frame, xlator_t *this); - -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2); - int32_t afr_unlock (call_frame_t *frame, xlator_t *this); @@ -897,42 +795,26 @@ int afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count); 
-int pump_start (call_frame_t *frame, xlator_t *this); - int __afr_fd_ctx_set (xlator_t *this, fd_t *fd); int afr_fd_ctx_set (xlator_t *this, fd_t *fd); -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children); - -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); int afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count); - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count); - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count); +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this); +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode); void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent); - -int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); +afr_replies_wipe (afr_local_t *local, afr_private_t *priv); void afr_local_cleanup (afr_local_t *local, xlator_t *this); @@ -940,32 +822,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this); int afr_frame_return (call_frame_t *frame); -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode); - -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb); - int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata); -void -afr_set_opendir_done (xlator_t *this, inode_t *inode); - -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode); - void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); int afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); -int -afr_launch_openfd_self_heal 
(call_frame_t *frame, xlator_t *this, fd_t *fd); - #define AFR_STACK_UNWIND(fop, frame, params ...) \ do { \ afr_local_t *__local = NULL; \ @@ -996,7 +862,16 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); } \ } while (0); -#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ +#define AFR_FRAME_INIT(frame, op_errno) \ + ({frame->local = mem_get0 (THIS->local_pool); \ + if (afr_local_init (frame->local, THIS->private, &op_errno)) { \ + afr_local_cleanup (frame->local, THIS); \ + mem_put (frame->local); \ + frame->local = NULL; }; \ + frame->local;}) + +#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0) + /* allocate and return a string that is the basename of argument */ static inline char * AFR_BASENAME (const char *str) @@ -1009,6 +884,9 @@ AFR_BASENAME (const char *str) return __basename_str; } +call_frame_t * +afr_copy_frame (call_frame_t *base); + int afr_transaction_local_init (afr_local_t *local, xlator_t *this); @@ -1016,9 +894,6 @@ int32_t afr_marker_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); -int32_t * -afr_children_create (int32_t child_count); - int afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); @@ -1027,101 +902,20 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, transaction_lk_type_t lk_type); int -afr_first_up_child (unsigned char *child_up, size_t child_count); +afr_higher_errno (int32_t old_errno, int32_t new_errno); int -afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid); +afr_final_errno (afr_local_t *local, afr_private_t *priv); -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child, uuid_t 
gfid); - -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index); - -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child); -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *children, unsigned int child_count); -void -afr_children_add_child (int32_t *children, int32_t child, - int32_t child_count); -void -afr_children_rm_child (int32_t *children, int32_t child, - int32_t child_count); -void -afr_reset_children (int32_t *children, int32_t child_count); -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio); int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno); -int -afr_get_children_count (int32_t *children, unsigned int child_count); -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child); -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count); -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count); -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name); -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *children, - struct iatt *bufs, unsigned int child_count, - const char *path); -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count); -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type); -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count); -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, - int32_t 
*stale_children); -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)); -void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req); void -afr_open_fd_fix (fd_t *fd, xlator_t *this); -int -afr_set_elem_count_get (unsigned char *elems, int child_count); +afr_fix_open (fd_t *fd, xlator_t *this); afr_fd_ctx_t * afr_fd_ctx_get (fd_t *fd, xlator_t *this); -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal); - -gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal); - void afr_set_low_priority (call_frame_t *frame); int @@ -1137,22 +931,9 @@ afr_matrix_cleanup (int32_t **pending, unsigned int m); int32_t** afr_matrix_create (unsigned int m, unsigned int n); -gf_boolean_t -afr_is_errno_set (int *child_errno, int child); - -gf_boolean_t -afr_is_errno_unset (int *child_errno, int child); - -gf_boolean_t -afr_is_fd_fixable (fd_t *fd); - void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count); -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); +afr_filter_xattrs (dict_t *xattr); + /* * Special value indicating we should use the "auto" quorum method instead of * a fixed value (including zero to turn off quorum enforcement). @@ -1172,28 +953,6 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); } \ } while (0); - -#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." 
- -#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ - if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ - op_errno = EIO; \ - gf_log (this->name, GF_LOG_WARNING, \ - AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ - goto label; \ - } \ -} while (0) - -#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ - if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ - op_errno = EIO; \ - loc_path (loc, NULL); \ - gf_log (this->name, GF_LOG_WARNING, \ - AFR_SBRAIN_MSG , loc->path); \ - goto label; \ - } \ -} while (0) - int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); @@ -1209,7 +968,7 @@ afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); void afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this); +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local); #endif /* __AFR_H__ */ -- cgit