diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr.h')
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 284 |
1 files changed, 226 insertions, 58 deletions
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 7de8d8243..21064db58 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -28,6 +28,10 @@ #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" + +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 struct _pump_private; @@ -58,10 +62,8 @@ typedef enum { AFR_INODE_SET_READ_CTX = 1, AFR_INODE_RM_STALE_CHILDREN, AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_SET_SPLIT_BRAIN, AFR_INODE_GET_READ_CTX, AFR_INODE_GET_OPENDIR_DONE, - AFR_INODE_GET_SPLIT_BRAIN, } afr_inode_op_t; typedef struct afr_inode_params_ { @@ -75,29 +77,41 @@ typedef struct afr_inode_params_ { } u; } afr_inode_params_t; +typedef enum afr_spb_state { + DONT_KNOW, + SPB, + NO_SPB +} afr_spb_state_t; + typedef struct afr_inode_ctx_ { uint64_t masks; int32_t *fresh_children;//increasing order of latency + afr_spb_state_t mdata_spb; + afr_spb_state_t data_spb; + uint32_t open_fd_count; } afr_inode_ctx_t; typedef enum { NONE, INDEX, + INDEX_TO_BE_HEALED, FULL, } afr_crawl_type_t; typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t iamshd; - afr_crawl_type_t *pending; - gf_boolean_t *inprogress; - afr_child_pos_t *pos; - gf_timer_t **timer; - eh_t *healed; - eh_t *heal_failed; - eh_t *split_brain; - char *node_uuid; - int timeout; + gf_boolean_t enabled; + gf_boolean_t iamshd; + afr_crawl_type_t *pending; + gf_boolean_t *inprogress; + afr_child_pos_t *pos; + gf_timer_t **timer; + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; + void **crawl_events; + char *node_uuid; + int timeout; } afr_self_heald_t; typedef struct _afr_private { @@ -162,9 +176,38 @@ typedef struct _afr_private { gf_boolean_t did_discovery; gf_boolean_t readdir_failover; uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; } afr_private_t; +typedef enum { + AFR_SELF_HEAL_NOT_ATTEMPTED, + AFR_SELF_HEAL_STARTED, + AFR_SELF_HEAL_FAILED, + AFR_SELF_HEAL_SYNC_BEGIN, +} afr_self_heal_status; + typedef struct { + afr_self_heal_status gfid_or_missing_entry_self_heal; + afr_self_heal_status metadata_self_heal; + afr_self_heal_status data_self_heal; + afr_self_heal_status entry_self_heal; +} afr_sh_status_for_all_type; + +typedef enum { + AFR_SELF_HEAL_ENTRY, + AFR_SELF_HEAL_METADATA, + AFR_SELF_HEAL_DATA, + AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, + AFR_SELF_HEAL_INVALID = -1, +} afr_self_heal_type; + +typedef enum { + AFR_CHECK_ALL, + AFR_CHECK_SPECIFIC, +} afr_sh_fail_check_type; + +struct afr_self_heal_ { /* External interface: These are variables (some optional) that are set by whoever has triggered self-heal */ @@ -241,10 +284,10 @@ typedef struct { const char *linkname; gf_boolean_t entries_skipped; - int op_failed; gf_boolean_t actual_sh_started; gf_boolean_t sync_done; gf_boolean_t data_lock_held; + gf_boolean_t sh_dom_lock_held; gf_boolean_t eof_reached; fd_t *healing_fd; int file_has_holes; @@ -255,27 +298,32 @@ typedef struct { uint8_t *checksum; afr_post_remove_call_t post_remove_call; - loc_t parent_loc; + char *data_sh_info; + char *metadata_sh_info; + loc_t parent_loc; call_frame_t *orig_frame; call_frame_t *old_loop_frame; gf_boolean_t unwound; afr_sh_algo_private_t *private; + afr_sh_status_for_all_type afr_all_sh_status; + afr_self_heal_type sh_type_in_action; struct afr_sh_algorithm *algo; afr_lock_cbk_t data_lock_success_handler; afr_lock_cbk_t data_lock_failure_handler; + gf_boolean_t data_lock_block; int (*completion_cbk) (call_frame_t *frame, xlator_t *this); int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - gf_boolean_t mdata_spb; - gf_boolean_t data_spb; call_frame_t *sh_frame; -} afr_self_heal_t; +}; + +typedef struct afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... */ @@ -337,11 +385,31 @@ afr_index_for_transaction_type (afr_transaction_type type) return -1; /* make gcc happy */ } +typedef struct { + loc_t loc; + char *basename; + unsigned char *locked_nodes; + int locked_count; + +} afr_entry_lockee_t; + +int +afr_entry_lockee_cmp (const void *l1, const void *l2); + +typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -350,23 +418,22 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; - unsigned char *entry_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; int32_t lk_call_count; int32_t lk_expected_count; + int32_t lk_attempted_count; int32_t lock_op_ret; int32_t lock_op_errno; afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; typedef struct _afr_locked_fd { @@ -386,9 +453,11 @@ typedef struct _afr_local { unsigned int call_count; unsigned int success_count; unsigned int enoent_count; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - unsigned int govinda_gOvinda; + unsigned int unhealable; unsigned int read_child_index; unsigned char read_child_returned; @@ -405,7 +474,6 @@ typedef struct _afr_local { loc_t newloc; fd_t *fd; - unsigned char *fd_open_on; glusterfs_fop_t fop; @@ -429,12 +497,23 @@ typedef struct _afr_local { int optimistic_change_log; gf_boolean_t delayed_post_op; - gf_boolean_t fop_paused; - int (*fop_call_continue) (call_frame_t *frame, xlator_t *this); - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; + + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + + int allow_sh_for_running_transaction; + + + /* This struct contains the arguments for the "continuation" + (scheme-like) of fops */ int op; @@ -530,7 +609,9 @@ typedef struct _afr_local { struct { struct iatt prebuf; struct iatt postbuf; + } inode_wfop; //common structure for all inode-write-fops + struct { int32_t op_ret; struct iovec *vector; @@ -541,34 +622,21 @@ typedef struct _afr_local { } writev; struct { - struct iatt prebuf; - struct iatt postbuf; - } fsync; - - struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } truncate; struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } ftruncate; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } setattr; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } fsetattr; struct { @@ -630,11 +698,32 @@ typedef struct _afr_local { dict_t *params; char *linkpath; } symlink; + + struct { + int32_t mode; + off_t offset; + size_t len; + } fallocate; + + struct { + off_t offset; + size_t len; + } discard; + + struct { + off_t offset; + size_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; + + } cont; struct { off_t start, len; + gf_boolean_t eager_lock_on; int *eager_lock; char *basename; @@ -645,6 +734,17 @@ typedef struct _afr_local { afr_transaction_type type; + /* pre-compute the post piggyback status before + entering POST-OP phase + */ + int *postop_piggybacked; + + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; + + struct list_head eager_locked; + int32_t **txn_changelog;//changelog after pre+post ops unsigned char *pre_op; @@ -682,11 +782,6 @@ typedef enum { } afr_fd_open_status_t; typedef struct { - struct list_head call_list; - call_frame_t *frame; -} afr_fd_paused_call_t; - -typedef struct { unsigned int *pre_op_done; afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ unsigned int *pre_op_piggyback; @@ -705,13 +800,20 @@ typedef struct { struct list_head entries; /* needed for readdir failover */ unsigned char *locked_on; /* which subvolumes locks have been successful */ - struct list_head paused_calls; /* queued calls while fix_open happens */ /* used for delayed-post-op optimization */ pthread_mutex_t delay_lock; gf_timer_t *delay_timer; call_frame_t *delay_frame; int call_child; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* list of frames currently in progress */ + struct list_head eager_locked; } afr_fd_ctx_t; @@ -747,6 +849,13 @@ int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2); int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count); + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); + +int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); int @@ -781,8 +890,8 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count); int pump_start (call_frame_t *frame, xlator_t *this); @@ -832,7 +941,8 @@ gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode); void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set); +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb); int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, @@ -947,8 +1057,9 @@ afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count); void afr_reset_children (int32_t *children, int32_t child_count); -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno); +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio); int afr_errno_count (int32_t *children, int *child_errno, unsigned int child_count, int32_t op_errno); @@ -991,11 +1102,11 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, int32_t sh_failed)); -int -afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, - int need_open_count, int *need_open); -int -afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop); +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); + +void +afr_open_fd_fix (fd_t *fd, xlator_t *this); int afr_set_elem_count_get (unsigned char *elems, int child_count); @@ -1022,6 +1133,23 @@ afr_matrix_cleanup (int32_t **pending, unsigned int m); int32_t** afr_matrix_create (unsigned int m, unsigned int n); + +gf_boolean_t +afr_is_errno_set (int *child_errno, int child); + +gf_boolean_t +afr_is_errno_unset (int *child_errno, int child); + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd); + +void +afr_prepare_new_entry_pending_matrix (int32_t **pending, + gf_boolean_t (*is_pending) (int *, int), + int *ctx, struct iatt *buf, + unsigned int child_count); +void +afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); /* * Special value indicating we should use the "auto" quorum method instead of * a fixed value (including zero to turn off quorum enforcement). @@ -1041,4 +1169,44 @@ afr_matrix_create (unsigned int m, unsigned int n); } \ } while (0); + +#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." + +#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ + if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ + op_errno = EIO; \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ + goto label; \ + } \ +} while (0) + +#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ + if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ + op_errno = EIO; \ + loc_path (loc, NULL); \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG , loc->path); \ + goto label; \ + } \ +} while (0) + +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); + +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); + +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this); + #endif /* __AFR_H__ */ |
