diff options
Diffstat (limited to 'xlators/cluster')
27 files changed, 3362 insertions, 104 deletions
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 903fbb39f12..bce94bb8b3b 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht ec +SUBDIRS = aha stripe afr dht ec CLEANFILES = diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index c324c02c008..ebadba99a05 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -43,6 +43,13 @@ #include "afr-self-heald.h" #include "afr-messages.h" +#define CHILD_UP_STR "UP" +#define CHILD_DOWN_STR "DOWN" +#define CHILD_DISCONNECTED_STR "DOWN" + +static int32_t +find_hybrid_children (xlator_t *this, unsigned char *fastest_children); + call_frame_t * afr_copy_frame (call_frame_t *base) { @@ -1457,21 +1464,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode) sizeof(gfid_copy)) % child_count; } +/* + * afr_halo_read_subvol + * + * Given a array representing the readable children, this function will + * return which one of the readable children meet the halo hybrid criteria. + * In the event none are found, -1 is returned and another strategy will have + * to be used to figure out where the read should come from. + */ +int afr_halo_read_subvol (xlator_t *this, unsigned char *readable) { + afr_private_t *priv = NULL; + unsigned char *hybrid_children; + int32_t hybrid_cnt = 0; + int read_subvol = -1; + int i = 0; + + priv = this->private; + + /* Halo in-active or hybrid mode disabled, bail.... */ + if (!priv->halo_enabled || !priv->halo_hybrid_mode) + return -1; + + /* AFR Discovery edge case, if you are already pinned to a child + * which meets the latency threshold then go with this child for + * consistency purposes. + */ + if (priv->read_child >= 0 && readable[priv->read_child] && + priv->child_latency[priv->read_child] <= + AFR_HALO_HYBRID_LATENCY_MSEC) { + return priv->read_child; + } + + hybrid_children = alloca0 (priv->child_count); + hybrid_cnt = find_hybrid_children (this, hybrid_children); + if (hybrid_cnt) { + for (i = 0; i < priv->child_count; i++) { + if (readable[i] && hybrid_children[i]) { + read_subvol = i; + priv->read_child = read_subvol; + gf_log (this->name, GF_LOG_TRACE, + "Selected hybrid child %d for reads", + i); + break; + } + } + } + + return read_subvol; +} + int afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, unsigned char *readable, afr_read_subvol_args_t *args) { - int i = 0; - int read_subvol = -1; - afr_private_t *priv = NULL; + int i = 0; + int read_subvol = -1; + afr_private_t *priv = NULL; afr_read_subvol_args_t local_args = {0,}; - priv = this->private; + priv = this->private; + + /* Choose lowest latency child for reads */ + read_subvol = afr_halo_read_subvol (this, readable); + if (read_subvol != -1) + return read_subvol; - /* first preference - explicitly specified or local subvolume */ - if (priv->read_child >= 0 && readable[priv->read_child]) + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) return priv->read_child; if (inode_is_linked (inode)) { @@ -1497,7 +1558,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, return -1; } - int afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, unsigned char *readable, int *event_p, @@ -2157,6 +2217,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index]->name); priv->read_child = child_index; + } else if (priv->halo_enabled) { + if (priv->read_child < 0) { + priv->read_child = child_index; + } else if (priv->child_latency[child_index] < + priv->child_latency[priv->read_child]) { + priv->read_child = child_index; + } } out: STACK_DESTROY(frame->root); @@ -2348,7 +2415,6 @@ unwind: return 0; } - int afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) { @@ -2574,6 +2640,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err) afr_local_t *local = NULL; afr_private_t *priv = NULL; int call_count = 0; + unsigned char *hybrid_children = NULL; local = frame->local; priv = this->private; @@ -2584,8 +2651,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err) goto out; } - call_count = local->call_count = AFR_COUNT (local->child_up, - priv->child_count); + hybrid_children = alloca0 (priv->child_count); + call_count = find_hybrid_children (this, hybrid_children); + if (call_count) { + for (i = 0; i < priv->child_count; i++) + local->child_up[i] = hybrid_children[i]; + gf_log (this->name, GF_LOG_TRACE, "Selected %d hybrid " + "children for LOOKUPs", call_count); + } else { + hybrid_children = NULL; + call_count = AFR_COUNT (local->child_up, priv->child_count); + } + + local->call_count = call_count; ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, &local->loc); @@ -2818,6 +2896,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get (loc->parent, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); + /* So this is the "secret" to why "Hybrid" halo works. Encoded in + * the cached inodes, we store what is effectively the "generational" + * state of the cluster along with a "packed" version of the extended + * attributes which determine which nodes are wise/fools. We can + * consult these cached values to figure out who we can trust, in the + * event the state of our cluster changes and we can no longer trust + * the cached info we "refresh" the inode (and hit all regions) to + * ensure we know which bricks we can safely read from. + */ if (event != local->event_generation) afr_inode_refresh (frame, this, loc->parent, NULL, afr_lookup_do); @@ -3042,7 +3129,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (flush, frame, local->op_ret, @@ -4290,25 +4377,569 @@ __get_heard_from_all_status (xlator_t *this) return heard_from_all; } +/* + * afr_cmp_child + * + * Passed to the qsort function to order a list of children by the latency + * and/or up/down states. + * + * Note: This isn't as simple as taking the latencies and calling it a + * a day. Children can be marked down, which overrides their latency + * signal. Having a lower-latency child available doesn't guarentee this + * child shall be marked up: we don't want to constantly be swapping + * slightly better bricks for others...this is jarring to clients and + * could cause all sorts of issues. Plus, the fail-over, max-replicas + * flags must all be honored which manage the up/down state of children. + * + * In short, the (as marked) up/down down state of the brick shall always + * take precedence when sorting by latency. + */ +static int +_afr_cmp_child (const void *child1, const void *child2) +{ + struct afr_child *child11 = (struct afr_child *)child1; + struct afr_child *child22 = (struct afr_child *)child2; + + /* If both children are _marked_ down they are equal */ + if (!child11->child_up && !child22->child_up) + return 0; + + /* Prefer child 2, child 1 is _marked_ down, child 2 is not */ + if (!child11->child_up && child22->child_up) + return 1; + + /* Prefer child 1, child 2 is _marked_ down, child 1 is not */ + if (child11->child_up && !child22->child_up) + return -1; + + if (child11->latency > child22->latency) { + return 1; + } + if (child11->latency == child22->latency) { + return 0; + } + return -1; +} + +/* + * find_hybrid_children + * + * Given a char array representing our children (aka bricks within our AFR + * AFR "subvolume"), we'll mark this array with the children which are + * within the halo_hybrid_read_max_latency_sec or if none fit this condition, + * we'll pick the fastest two bricks. + * + * You might ask, why not just pick the quickest brick and be done with it? + * Well, being within our set is not suffcient to be chosen for the read, + * we must also be marked "readable", we still want to choose as many as + * we can within our local region to ensure we have somebody that is readable. + * + * To illustrate this, consider the case where a 1/2 bricks received a sync + * from some other writer, and the 2nd brick although faster wasn't present. + * In this case we'll want to use the slower brick to service the read. + * + * In short, this function just tells the caller which hybrid children, + * it gives no signal as to their readability, nor should it since this is + * handled later in the various flows (e.g. by afr_halo_read_subvol). + */ +static int32_t +find_hybrid_children (xlator_t *this, unsigned char *hybrid_children) +{ + int32_t i = 0; + afr_private_t *priv = NULL; + struct afr_child *sorted_list = NULL; + uint32_t max_latency; + uint32_t limit = AFR_HALO_HYBRID_CHILD_LIMIT; + + priv = this->private; + + if (!priv->halo_enabled || !priv->halo_hybrid_mode) + return 0; + + if (limit > priv->child_count) + limit = priv->child_count; + + max_latency = priv->halo_hybrid_read_max_latency_msec; + + sorted_list = alloca (sizeof (struct afr_child) * priv->child_count); + + /* Find children meeting the latency threshold */ + for (i = 0; i < priv->child_count; i++) { + sorted_list[i].idx = i; + sorted_list[i].child_up = priv->child_up[i]; + sorted_list[i].latency = priv->child_latency[i]; + } + + /* QuickSort the children according to latency */ + qsort (sorted_list, priv->child_count, sizeof (struct afr_child), + _afr_cmp_child); + + i = 0; + while (i < priv->child_count && sorted_list[i].latency <= max_latency) + hybrid_children[sorted_list[i++].idx] = 1; + + /* Found some candidates */ + if (i != 0) + return i; + + /* If no candidates can be found meeting the max_latency threshold + * then find the best of those we have to our limit. + */ + for (i = 0; i < limit; i++) + hybrid_children[sorted_list[i].idx] = 1; + + return i; +} + +int +find_best_down_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t best_child = -1; + int64_t best_latency = INT64_MAX; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] < best_latency) { + best_child = i; + best_latency = priv->child_latency[i]; + } + } + if (best_child >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "Found best down child (%d) " + "@ %ld ms latency", best_child, best_latency); + } + return best_child; +} + +int +find_worst_up_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t worst_child = -1; + int64_t worst_latency = INT64_MIN; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] >= worst_latency) { + worst_child = i; + worst_latency = priv->child_latency[i]; + } + } + if (worst_child >= 0) { + gf_log (this->name, GF_LOG_DEBUG, "Found worst up child (%d)" + " @ %ld ms latency", worst_child, worst_latency); + } + return worst_child; +} + +static const char *halo_state_str(int i) +{ + switch (i) { + case 0: return "DOWN"; + case 1: return "UP"; + } + + return "unknown"; +} + + +static void dump_halo_states (xlator_t *this) { + afr_private_t *priv = NULL; + int i = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_latency[i] == AFR_CHILD_DOWN_LATENCY) { + gf_log (this->name, GF_LOG_DEBUG, + "Child %d halo state: %s (N/A)", + i, + halo_state_str(priv->child_up[i])); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "Child %d halo state: %s (%"PRIi64" ms)", + i, + halo_state_str(priv->child_up[i]), + priv->child_latency[i]); + } + } +} + +static void +_afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, + const int idx, const int64_t halo_max_latency_msec, + int32_t *event, int64_t *child_latency_msec, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int best_down_child = 0; + uint64_t latency_samples = 0; + + priv = this->private; + + /* Base it off the _minimum_ latency we've ever seen */ + *child_latency_msec = child_xlator->client_latency.min / 1000.0; + latency_samples = child_xlator->client_latency.count; + priv->child_latency[idx] = *child_latency_msec; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] == 1) { + up_children++; + } + } + + /* Don't do anything until you have some minimum numbner of + * latency samples */ + if (priv->halo_enabled == _gf_true && child_halo_enabled == _gf_false) { + gf_log (child_xlator->name, GF_LOG_INFO, "In-sufficient " + " number of latency samples (%" PRIu64 + " < %d), halo in-active.", + latency_samples, priv->halo_min_samples); + } + + gf_log (this->name, GF_LOG_DEBUG, + "ping: child %u (%s) latency %"PRIu64" ms (max %"PRIu64" ms)" + " up_count %d (min %d) enabled %s", + idx, child_xlator ? child_xlator->name : "<null>", + *child_latency_msec, + halo_max_latency_msec, + up_children, + priv->halo_min_replicas, + child_halo_enabled ? "true" : "false"); + + /* + * Case 1: This child's latency exceeds the maximum allowable + * for this halo. + */ + if (child_halo_enabled && + *child_latency_msec > halo_max_latency_msec && + priv->child_up[idx] == 1 && + up_children > priv->halo_min_replicas) { + if (find_worst_up_child (this) == idx) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%"PRIi64"ms) " + "exceeds halo threshold (%"PRIi64"), " + "marking child down, " + "min_replicas (%d) still " + "satisfied.", + *child_latency_msec, + halo_max_latency_msec, + priv->halo_min_replicas); + *event = GF_EVENT_CHILD_DOWN; + } + /* + * Case 2: Child latency is within halo and currently marked down, + * mark it up. + */ + } else if ((child_halo_enabled == _gf_false || + *child_latency_msec <= halo_max_latency_msec) && + priv->child_up[idx] == 0) { + if (child_halo_enabled == _gf_false || + up_children < priv->halo_max_replicas) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%ld ms) " + "below halo threshold (%ld) or halo is " + "disabled, marking child up.", + *child_latency_msec, + halo_max_latency_msec); + *event = GF_EVENT_CHILD_UP; + } else { + gf_log (child_xlator->name, GF_LOG_INFO, + "Not marking child %d up, " + "max replicas (%d) reached.", idx, + priv->halo_max_replicas); + } + /* + * Case 3: Child latency is within halo,and currently marked up, + * mark it down if it's the highest latency child and the + * number of up children is greater than halo_max_replicas. + * UNLESS you are an SHD in which case do nothing. + */ + } else if ((child_halo_enabled == _gf_true && + *child_latency_msec <= halo_max_latency_msec) && + priv->child_up[idx] == 1) { + if (find_worst_up_child (this) == idx && + up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%"PRIi64"ms) " + "exceeds halo threshold (%"PRIi64"), " + "but halo_max_replicas (%d) exceeded, " + "marking child down.", + *child_latency_msec, + halo_max_latency_msec, + priv->halo_max_replicas); + *event = GF_EVENT_CHILD_DOWN; + } + } + + if (*event != GF_EVENT_CHILD_PING && + gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "Initial halo states:"); + dump_halo_states (this); + } +} + +void +_afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t halo_max_latency_msec, + int32_t *event, int32_t *call_psh, int32_t *up_child, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int worst_up_child = -1; + gf_boolean_t was_down = _gf_false; + + priv = this->private; + + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + /* + * Track the fact we did this, we may need to repeal this + * if we later decide to mark this brick down. + */ + was_down = _gf_true; + priv->event_generation++; + } + priv->child_up[idx] = 1; + + *call_psh = 1; + *up_child = idx; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + + /* + * Handle the edge case where we exceed + * halo_min_replicas and we've got a child which is + * marked up as it was helping to satisfy the + * halo_min_replicas even though it's latency exceeds + * halo_max_latency_msec. + */ + if (child_halo_enabled == _gf_true && + up_children > priv->halo_min_replicas) { + worst_up_child = find_worst_up_child (this); + if (worst_up_child >= 0 && + priv->child_latency[worst_up_child] > + halo_max_latency_msec) { + if (was_down == _gf_true) + priv->event_generation--; + *call_psh = 0; + priv->child_up[worst_up_child] = 0; + up_children--; + gf_log (this->name, GF_LOG_DEBUG, + "Marking child %d down, " + "doesn't meet halo threshold " + "(%ld), and > " + "halo_min_replicas (%d)", + worst_up_child, + halo_max_latency_msec, + priv->halo_min_replicas); + goto out; + } + } + if (priv->halo_enabled && + up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + if (was_down == _gf_true) + priv->event_generation--; + *call_psh = 0; + worst_up_child = find_worst_up_child (this); + if (worst_up_child < 0) { + worst_up_child = idx; + } + priv->child_up[worst_up_child] = 0; + gf_log (this->name, GF_LOG_INFO, + "Marking child %d down, " + "up_children (%d) > " + "halo_max_replicas (%d)", + worst_up_child, + up_children, + priv->halo_max_replicas); + up_children--; + goto out; + } +out: + if (up_children == 1) { + gf_log (this->name, GF_LOG_INFO, + "Subvolume '%s' came back up; " + "going online.", + child_xlator->name); + } else { + *event = GF_EVENT_CHILD_MODIFIED; + } + + priv->last_event[idx] = *event; + + if (gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "New halo states:"); + dump_halo_states (this); + } +} + +void +_afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, + int idx, int64_t child_latency_msec, + int64_t halo_max_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child, + gf_boolean_t child_halo_enabled) +{ + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int down_children = 0; + int best_down_child = -1; + gf_boolean_t swap_child = _gf_false; + + priv = this->private; + + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->event_generation++; + } + + /* + * If this is an _actual_ CHILD_DOWN event, we + * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to + * indicate the child is really disconnected. + */ + if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) { + priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY; + } + priv->child_up[idx] = 0; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + + /* + * Handle the edge case where we need to find the + * next best child (to mark up) as marking this child + * down would cause us to fall below halo_min_replicas. + * We will also force the SHD to heal this child _now_ + * as we want it to be up to date if we are going to + * begin using it synchronously. + */ + best_down_child = find_best_down_child (this); + if (child_halo_enabled == _gf_true) { + if (up_children < priv->halo_min_replicas && + priv->halo_failover_enabled == _gf_true) + swap_child = _gf_true; + else if (up_children < priv->halo_max_replicas && + priv->child_latency[best_down_child] <= + halo_max_latency_msec && + priv->halo_failover_enabled == _gf_true) + swap_child = _gf_true; + } + + if (swap_child) { + if (best_down_child >= 0) { + gf_log (this->name, GF_LOG_INFO, + "Swapping out child %d for " + "child %d to satisfy " + "halo_min_replicas (%d).", + idx, best_down_child, + priv->halo_min_replicas); + priv->child_up[best_down_child] = 1; + *call_psh = 1; + *up_child = best_down_child; + } + } + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "All subvolumes are down. Going " + "offline until atleast one of them " + "comes back up."); + } else { + *event = GF_EVENT_CHILD_MODIFIED; + } + priv->last_event[idx] = *event; + + if (gf_log_get_loglevel () >= GF_LOG_DEBUG) { + gf_log (this->name, GF_LOG_DEBUG, "New halo states:"); + dump_halo_states (this); + } +} + +int64_t +_afr_get_halo_latency (xlator_t *this) +{ + afr_private_t *priv = NULL; + int64_t halo_max_latency_msec = 0; + + priv = this->private; + + if (priv->shd.iamshd) { + halo_max_latency_msec = priv->shd.halo_max_latency_msec; + } else if (priv->nfsd.iamnfsd) { + halo_max_latency_msec = + priv->nfsd.halo_max_latency_msec; + } else { + halo_max_latency_msec = priv->halo_max_latency_msec; + } + gf_log (this->name, GF_LOG_DEBUG, "Using halo latency %ld", + halo_max_latency_msec); + return halo_max_latency_msec; +} + + int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2) { + xlator_t *child_xlator = NULL; afr_private_t *priv = NULL; int i = -1; - int up_children = 0; - int down_children = 0; int propagate = 0; int had_heard_from_all = 0; int have_heard_from_all = 0; int idx = -1; int ret = -1; int call_psh = 0; + int up_child = -1; + uint64_t latency_samples = 0; dict_t *input = NULL; dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; gf_boolean_t has_quorum = _gf_false; + int64_t halo_max_latency_msec = 0; + int64_t child_latency_msec = AFR_CHILD_DOWN_LATENCY; + gf_boolean_t child_halo_enabled = _gf_false; + child_xlator = (xlator_t *)data; priv = this->private; if (!priv) @@ -4321,7 +4952,7 @@ afr_notify (xlator_t *this, int32_t event, * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. */ priv->did_discovery = _gf_false; - + latency_samples = child_xlator->client_latency.count; /* parent xlators dont need to know about every child_up, child_down * because of afr ha. If all subvolumes go down, child_down has @@ -4332,7 +4963,7 @@ afr_notify (xlator_t *this, int32_t event, * subsequent revalidate lookup happens on all the dht's subvolumes * which triggers afr self-heals if any. */ - idx = find_child_index (this, data); + idx = find_child_index (this, child_xlator); if (idx < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, "Received child_up from invalid subvolume"); @@ -4341,6 +4972,28 @@ afr_notify (xlator_t *this, int32_t event, had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, this); + + if (!priv->halo_enabled || + latency_samples < priv->halo_min_samples) { + child_halo_enabled = _gf_false; + halo_max_latency_msec = INT64_MAX; + } else { + child_halo_enabled = _gf_true; + halo_max_latency_msec = _afr_get_halo_latency (this); + } + + if (event == GF_EVENT_CHILD_PING) { + /* Calculates the child latency and sets event + */ + LOCK (&priv->lock); + { + _afr_handle_ping_event (this, child_xlator, idx, + halo_max_latency_msec, &event, + &child_latency_msec, child_halo_enabled); + } + UNLOCK (&priv->lock); + } + if (event == GF_EVENT_TRANSLATOR_OP) { LOCK (&priv->lock); { @@ -4367,52 +5020,16 @@ afr_notify (xlator_t *this, int32_t event, propagate = 1; break; case GF_EVENT_CHILD_UP: - /* - * This only really counts if the child was never up - * (value = -1) or had been down (value = 0). See - * comment at GF_EVENT_CHILD_DOWN for a more detailed - * explanation. - */ - if (priv->child_up[idx] != 1) { - priv->event_generation++; - } - priv->child_up[idx] = 1; - - call_psh = 1; - up_children = __afr_get_up_children_count (priv); - if (up_children == 1) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOL_UP, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); - } else { - event = GF_EVENT_CHILD_MODIFIED; - } - - priv->last_event[idx] = event; - + _afr_handle_child_up_event (this, child_xlator, + idx, halo_max_latency_msec, &event, &call_psh, + &up_child, child_halo_enabled); break; case GF_EVENT_CHILD_DOWN: - if (priv->child_up[idx] == 1) { - priv->event_generation++; - } - priv->child_up[idx] = 0; - - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 0) - down_children++; - if (down_children == priv->child_count) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_ALL_SUBVOLS_DOWN, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); - } else { - event = GF_EVENT_SOME_CHILD_DOWN; - } - - priv->last_event[idx] = event; - + _afr_handle_child_down_event (this, child_xlator, idx, + child_latency_msec, halo_max_latency_msec, + &event, &call_psh, &up_child, + child_halo_enabled); break; case GF_EVENT_CHILD_CONNECTING: @@ -4439,7 +5056,6 @@ afr_notify (xlator_t *this, int32_t event, had come up, propagate CHILD_UP, but only this time */ event = GF_EVENT_CHILD_DOWN; - up_children = __afr_get_up_children_count (priv); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 7f7962013d7..c7d6261b110 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -46,7 +46,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_spbc_timeout_t, gf_afr_mt_spb_status_t, gf_afr_mt_empty_brick_t, - gf_afr_mt_end + gf_afr_mt_child_latency_t, + gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 1b3b1ca0af1..9c12e433097 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -371,7 +371,7 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_private_t *priv = NULL; off_t off = 0; - size_t block = 128 * 1024; + size_t block = 0; int type = AFR_SELFHEAL_DATA_FULL; int ret = -1; call_frame_t *iter_frame = NULL; @@ -383,6 +383,8 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, healed_sinks[ARBITER_BRICK_INDEX] = 0; } + block = 128 * 1024 * priv->data_self_heal_window_size; + type = afr_data_self_heal_type_get (priv, healed_sinks, source, replies); diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index c6ac5ebfd1b..4ac1d32f58a 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -58,6 +58,7 @@ typedef struct { eh_t **statistics; uint32_t max_threads; uint32_t wait_qlength; + uint32_t halo_max_latency_msec; } afr_self_heald_t; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 6f4783c9213..ae9b28c7fb4 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -176,6 +176,42 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("data-self-heal-algorithm", priv->data_self_heal_algorithm, options, str, out); + GF_OPTION_RECONF ("halo-enabled", + priv->halo_enabled, options, bool, + out); + + GF_OPTION_RECONF ("halo-failover-enabled", + priv->halo_failover_enabled, options, bool, + out); + + GF_OPTION_RECONF ("halo-shd-max-latency", + priv->shd.halo_max_latency_msec, options, uint32, + out); + + GF_OPTION_RECONF ("halo-nfsd-max-latency", + priv->nfsd.halo_max_latency_msec, options, uint32, + out); + + GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec, + options, uint32, out); + + GF_OPTION_RECONF ("halo-hybrid-mode", + priv->halo_hybrid_mode, options, bool, + out); + + GF_OPTION_RECONF ("halo-hybrid-read-max-latency", + priv->halo_hybrid_read_max_latency_msec, options, + uint32, out); + + GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options, + uint32, out); + + GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options, + uint32, out); + + GF_OPTION_RECONF ("halo-min-samples", priv->halo_min_samples, options, + uint32, out); + GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -396,6 +432,35 @@ init (xlator_t *this) GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); + GF_OPTION_INIT ("halo-hybrid-mode", + priv->halo_hybrid_mode, bool, out); + + GF_OPTION_INIT ("halo-hybrid-read-max-latency", + priv->halo_hybrid_read_max_latency_msec, uint32, + out); + + GF_OPTION_INIT ("halo-enabled", + priv->halo_enabled, bool, out); + + GF_OPTION_INIT ("halo-failover-enabled", + priv->halo_failover_enabled, bool, out); + + GF_OPTION_INIT ("halo-shd-max-latency", priv->shd.halo_max_latency_msec, + uint32, out); + GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec, + uint32, out); + GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32, + out); + GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32, + out); + GF_OPTION_INIT ("halo-min-samples", priv->halo_min_samples, uint32, + out); + + GF_OPTION_INIT ("halo-nfsd-max-latency", + priv->nfsd.halo_max_latency_msec, uint32, out); + + GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out); + GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -445,17 +510,24 @@ init (xlator_t *this) priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); - if (!priv->child_up) { + + priv->child_latency = GF_CALLOC (sizeof (*priv->child_latency), + child_count, + gf_afr_mt_child_latency_t); + + if (!priv->child_up || !priv->child_latency) { ret = -ENOMEM; goto out; } - for (i = 0; i < child_count; i++) + for (i = 0; i < child_count; i++) { + priv->child_latency[i] = 0.0; priv->child_up[i] = -1; /* start with unknown state. this initialization needed for afr_notify() to work reliably */ + } priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, gf_afr_mt_xlator_t); @@ -663,6 +735,85 @@ struct volume_options options[] = { "jobs that can perform parallel heals in the " "background." }, + { .key = {"halo-shd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .description = "Maximum latency for shd halo replication in msec." + }, + { .key = {"halo-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .description = "Enable Halo (geo) replication mode." + }, + { .key = {"halo-failover-enabled"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "False", + .description = "Enable x-halo failover: will allow failover " + "to bricks outside the client or daemons' halo " + "in an attempt to satisfy halo-min-replicas." + }, + { .key = {"halo-nfsd-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .description = "Maximum latency for nfsd halo replication in msec." + }, + { .key = {"halo-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "5", + .description = "Maximum latency for halo replication in msec." + }, + { .key = {"halo-hybrid-mode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Enable hybrid sync mounts. When enabled, halo will " + "do write FOPs synchronously, and read FOPs will be " + "services in-region if the inode is clean/consistent." + "If no bricks can be found below " + "halo-hybrid-max-read-latency then the best 2 shall " + "be selected. This option can be used in " + "conjunction with all other halo options." + }, + { .key = {"halo-hybrid-read-max-latency"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "8", + .description = "Maximum latency hybrid mode will use to select " + "children for read FOPs. Don't tune this unless " + "you really know what you are doing (i.e. you've " + "read/understand the associated source code)." + }, + { .key = {"halo-max-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "99999", + .description = "The maximum number of halo replicas; replicas" + " beyond this value will be written asynchronously" + "via the SHD." + }, + { .key = {"halo-min-replicas"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "2", + .description = "The minimum number of halo replicas, before adding " + "out of region replicas." + }, + { .key = {"halo-min-samples"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 99999, + .default_value = "3", + .description = "The minimum number of halo latency samples, before " + "we start forming the halos." + }, { .key = {"heal-wait-queue-length"}, .type = GF_OPTION_TYPE_INT, .min = 0, @@ -802,6 +953,13 @@ struct volume_options options[] = { "translator is running as part of self-heal-daemon " "or not." }, + { .key = {"iam-nfs-daemon"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option differentiates if the replicate " + "translator is running as part of an NFS daemon " + "or not." + }, { .key = {"quorum-type"}, .type = GF_OPTION_TYPE_STR, .value = { "none", "auto", "fixed"}, @@ -866,7 +1024,7 @@ struct volume_options options[] = { }, { .key = {"heal-timeout"}, .type = GF_OPTION_TYPE_INT, - .min = 60, + .min = 5, .max = INT_MAX, .default_value = "600", .description = "time interval for checking the need to self-heal " diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 70c3e349743..aa19f1eeb37 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -30,6 +30,9 @@ #define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" #define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) +#define AFR_CHILD_DOWN_LATENCY INT64_MAX /* Latency for down children */ +#define AFR_HALO_HYBRID_CHILD_LIMIT 2 /* Examine bricks <= 10 msec */ +#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0 /* Examine bricks <= 10 msec */ #define AFR_LOCKEE_COUNT_MAX 3 #define AFR_DOM_COUNT_MAX 3 #define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ @@ -72,6 +75,17 @@ typedef enum { AFR_FAV_CHILD_POLICY_MAX, } afr_favorite_child_policy; +struct afr_nfsd { + gf_boolean_t iamnfsd; + uint32_t halo_max_latency_msec; +}; + +struct afr_child { + uint32_t idx; + int64_t latency; + unsigned char child_up; +}; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -83,6 +97,7 @@ typedef struct _afr_private { inode_t *root_inode; unsigned char *child_up; + int64_t *child_latency; unsigned char *local; char **pending_key; @@ -153,8 +168,19 @@ typedef struct _afr_private { gf_boolean_t ensure_durability; char *sh_domain; char *afr_dirty; - - afr_self_heald_t shd; + gf_boolean_t halo_enabled; + + /* Halo geo-replication tunables */ + gf_boolean_t halo_failover_enabled; + gf_boolean_t halo_hybrid_mode; + uint32_t halo_hybrid_read_max_latency_msec; + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; + uint32_t halo_min_samples; + + afr_self_heald_t shd; + struct afr_nfsd nfsd; gf_boolean_t consistent_metadata; uint64_t spb_choice_timeout; diff --git a/xlators/cluster/aha/Makefile.am b/xlators/cluster/aha/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/cluster/aha/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/aha/src/Makefile.am b/xlators/cluster/aha/src/Makefile.am new file mode 100644 index 00000000000..006db127d28 --- /dev/null +++ b/xlators/cluster/aha/src/Makefile.am @@ -0,0 +1,18 @@ + +xlator_LTLIBRARIES = aha.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +aha_la_LDFLAGS = -module -avoid-version + +aha_la_SOURCES = aha.c aha-fops.c aha-helpers.c aha-retry.c +aha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = aha-mem-types.h aha.h aha-helpers.h aha.h aha-retry.h aha-fops.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/cluster/aha/src/aha-fops.c b/xlators/cluster/aha/src/aha-fops.c new file mode 100644 index 00000000000..3b2ca641de2 --- /dev/null +++ b/xlators/cluster/aha/src/aha-fops.c @@ -0,0 +1,952 @@ +#include "aha-fops.h" + +static void +__save_fop (struct aha_fop *fop, struct aha_conf *conf) +{ + list_add_tail (&fop->list, &conf->failed); +} + +void +save_fop (struct aha_fop *fop, struct aha_conf *conf) +{ + LOCK (&conf->lock); + { + __save_fop (fop, conf); + } + UNLOCK (&conf->lock); +} + +#define AHA_HANDLE_FOP(frame, type, cbk, obj, fn, args ...) \ + do { \ + struct aha_fop *fop = aha_fop_new (); \ + if (!fop) { \ + gf_log (GF_AHA, GF_LOG_CRITICAL, \ + "Allocation failed, terminating " \ + "to prevent a hung mount."); \ + assert (0); \ + } \ + fop->stub = fop_##type##_stub (frame, aha_##type, \ + args); \ + fop->frame = frame; \ + frame->local = fop; \ + STACK_WIND (frame, cbk, obj, fn, args); \ + } while (0) \ + +/* + * AHA_HANDLE_FOP_CBK + * + * 1) If the error returned is ENOTCONN *and* the timer that waits + * for the server to come back has not expired, store the fop to retry later. + * 2) If the timer waiting for the server has expired, just unwind. + * 3) If the error returned is something other than ENOTCONN, just unwind. + * + */ +#define AHA_HANDLE_FOP_CBK(type, frame, args ...) \ + do { \ + struct aha_conf *conf = frame->this->private; \ + struct aha_fop *fop = frame->local; \ + if (op_ret != 0 && op_errno == ENOTCONN && \ + !aha_is_timer_expired (conf)) { \ + gf_log (GF_AHA, GF_LOG_WARNING, \ + "Got ENOTCONN from client, storing " \ + "to retry later!"); \ + save_fop (fop, conf); \ + } else { \ + AHA_DESTROY_LOCAL (frame); \ + STACK_UNWIND_STRICT (type, frame, args); \ + } \ + } while (0) \ + +int +aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + AHA_HANDLE_FOP_CBK (lookup, frame, op_ret, op_errno, inode, + buf, xdata, postparent); + return 0; +} + + +int +aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, lookup, aha_lookup_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, + loc, xdata); + return 0; +} + + +int +aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, stat, aha_stat_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->stat, + loc, xdata); + return 0; +} + + +int +aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (setattr, frame, op_ret, op_errno, preop, + postop, xdata); + return 0; +} + + +int +aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, setattr, aha_setattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; +} + + +int +aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsetattr, frame, op_ret, op_errno, preop, + postop, xdata); + return 0; +} + +int +aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsetattr, aha_fsetattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; +} + + +int +aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (truncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, truncate, aha_truncate_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->truncate, + loc, offset, xdata); + return 0; +} + + +int +aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (ftruncate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, ftruncate, aha_ftruncate_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->ftruncate, + fd, offset, xdata); + return 0; +} + + +int +aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (access, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t mask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, access, aha_access_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->access, + loc, mask, xdata); + return 0; +} + + +int +aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *path, struct iatt *sbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readlink, frame, op_ret, op_errno, + path, sbuf, xdata); + return 0; +} + + +int +aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, + size_t size, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readlink, aha_readlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readlink, + loc, size, xdata); + return 0; +} + + +int +aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (mknod, frame, op_ret, op_errno, + inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, + mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, mknod, aha_mknod_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->mknod, + loc, mode, rdev, umask, xdata); + return 0; +} + + +int +aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (mkdir, frame, op_ret, op_errno, + inode, buf, + preparent, postparent, xdata); + return 0; +} + +int +aha_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, mkdir, aha_mkdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; +} + + +int +aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (unlink, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + + +int +aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, unlink, aha_unlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->unlink, + loc, xflag, xdata); + return 0; +} + + +int +aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (rmdir, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + + +int +aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, rmdir, aha_rmdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rmdir, + loc, flags, xdata); + return 0; +} + + +int +aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (symlink, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, symlink, aha_symlink_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->symlink, + linkpath, loc, umask, xdata); + return 0; +} + + +int +aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (rename, frame, op_ret, op_errno, buf, + preoldparent, postoldparent, + prenewparent, postnewparent, xdata); + return 0; +} + + +int +aha_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, rename, aha_rename_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rename, + oldloc, newloc, xdata); + return 0; +} + + +int +aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, link, aha_link_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->link, + oldloc, newloc, xdata); + return 0; +} + + +int +aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (create, frame, op_ret, op_errno, fd, inode, buf, + preparent, postparent, xdata); + return 0; +} + + +int +aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, mode_t mode, mode_t umask, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, create, aha_create_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +} + + +int +aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + + +int +aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t flags, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, open, aha_open_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, + loc, flags, fd, xdata); + return 0; +} + +int +aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readv, frame, op_ret, op_errno, + vector, count, stbuf, iobref, xdata); + return 0; +} + +int +aha_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readv, aha_readv_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; +} + + +int +aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (writev, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + +int +aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, + off_t off, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, writev, aha_writev_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev, + fd, vector, count, off, flags, iobref, xdata); + return 0; +} + + +int +aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (flush, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, flush, aha_flush_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, + fd, xdata); + return 0; +} + + +int +aha_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsync, frame, op_ret, op_errno, + prebuf, postbuf, xdata); + return 0; +} + + +int +aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsync, aha_fsync_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, + fd, flags, xdata); + return 0; +} + + +int +aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fstat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + + +int +aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fstat, aha_fstat_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fstat, + fd, xdata); + return 0; +} + + +int +aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (opendir, frame, op_ret, op_errno, fd, xdata); + return 0; +} + + +int +aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, opendir, aha_opendir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->opendir, + loc, fd, xdata); + return 0; +} + +int +aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsyncdir, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsyncdir, aha_fsyncdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsyncdir, + fd, flags, xdata); + return 0; +} + + +int +aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (statfs, frame, op_ret, op_errno, buf, xdata); + return 0; +} + + +int +aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, statfs, aha_statfs_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->statfs, + loc, xdata); + return 0; +} + + + +int +aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, setxattr, aha_setxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; +} + + +int +aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, getxattr, aha_getxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->getxattr, + loc, name, xdata); + return 0; +} + +int +aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fsetxattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fsetxattr, aha_fsetxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; +} + + +int +aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fgetxattr, aha_fgetxattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fgetxattr, + fd, name, xdata); + return 0; +} + + +int +aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (xattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, xattrop, aha_xattrop_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->xattrop, + loc, flags, dict, xdata); + return 0; +} + + +int +aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fxattrop, frame, op_ret, op_errno, dict, xdata); + return 0; +} + + +int +aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fxattrop, aha_fxattrop_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fxattrop, + fd, flags, dict, xdata); + return 0; +} + + +int +aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (removexattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, removexattr, aha_removexattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->removexattr, + loc, name, xdata); + return 0; +} + +int +aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fremovexattr, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fremovexattr, aha_fremovexattr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fremovexattr, + fd, name, xdata); + return 0; +} + + +int +aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (lk, frame, op_ret, op_errno, lock, xdata); + return 0; +} + + +int +aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, lk, aha_lk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lk, + fd, cmd, lock, xdata); + return 0; +} + + +int +aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (inodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_inodelk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, inodelk, aha_inodelk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->inodelk, + volume, loc, cmd, lock, xdata); + return 0; +} + + +int +aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (finodelk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, finodelk, aha_finodelk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->finodelk, + volume, fd, cmd, lock, xdata); + return 0; +} + + +int +aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (entrylk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, entrylk, aha_entrylk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->entrylk, + volume, loc, basename, cmd, type, xdata); + return 0; +} + + +int +aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (fentrylk, frame, op_ret, op_errno, xdata); + return 0; +} + + +int +aha_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, fentrylk, aha_fentrylk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fentrylk, + volume, fd, basename, cmd, type, xdata); + return 0; +} + +int +aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readdir, frame, op_ret, op_errno, entries, xdata); + return 0; +} + + +int +aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t off, dict_t *xdata) +{ + AHA_HANDLE_FOP (frame, readdir, aha_readdir_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readdir, + fd, size, off, xdata); + return 0; +} + + +int +aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + AHA_HANDLE_FOP_CBK (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + + +int +aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + AHA_HANDLE_FOP (frame, readdirp, aha_readdirp_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readdirp, + fd, size, off, dict); + return 0; +} diff --git a/xlators/cluster/aha/src/aha-fops.h b/xlators/cluster/aha/src/aha-fops.h new file mode 100644 index 00000000000..b1fb9d38a80 --- /dev/null +++ b/xlators/cluster/aha/src/aha-fops.h @@ -0,0 +1,360 @@ +#ifndef _AHA_FOPS_H +#define _AHA_FOPS_H + +#include "aha.h" +#include "aha-helpers.h" + +/* FOP functions */ +int +aha_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +aha_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata); + +int +aha_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata); + +int +aha_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata); + +int +aha_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); + +int +aha_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); + +int +aha_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata); + +int +aha_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); + +int +aha_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); + +int +aha_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); + +int +aha_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata); + +int +aha_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int +aha_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); + +int +aha_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); + +int +aha_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata); + +int +aha_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, + dict_t *xdata); + +int +aha_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t off, uint32_t flags, + struct iobref *iobref, dict_t *xdata); + +int +aha_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int +aha_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t flags, dict_t *xdata); + +int +aha_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); + +int +aha_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); + +int +aha_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata); + +int +aha_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +aha_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata); + +int +aha_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); + +int +aha_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + dict_t *dict, int32_t flags, dict_t *xdata); + +int +aha_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int +aha_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int +aha_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata); + +int +aha_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); + +int +aha_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); + +int +aha_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata); + +int +aha_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, int32_t cmd, struct gf_flock *lock, + dict_t *xdata); + +int +aha_finodelk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, int32_t cmd, + struct gf_flock *lock, dict_t *xdata); + +int +aha_entrylk (call_frame_t *frame, xlator_t *this, + const char *volume, loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); + +int +aha_fentrylk (call_frame_t *frame, xlator_t *this, + const char *volume, fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type, dict_t *xdata); +int +aha_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata); + +int +aha_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict); + +/* Callback functions */ + +int +aha_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent); + +int +aha_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata); + +int +aha_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata); + +int +aha_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata); + +int +aha_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata); + + +int +aha_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); + + +int +aha_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + + +int +aha_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *path, struct iatt *sbuf, dict_t *xdata); + + +int +aha_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + + +int +aha_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); + +int +aha_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); + +int +aha_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); +int +aha_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); +int +aha_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata); + +int +aha_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata); +int +aha_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); +int +aha_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata); +int +aha_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, dict_t *xdata); + +int +aha_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata); +int +aha_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata); +int +aha_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata); + +int +aha_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata); +int +aha_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *buf, + dict_t *xdata); +int +aha_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata); + +int +aha_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata); + +int +aha_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); + +int +aha_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); +int +aha_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata); +int +aha_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata); + +#endif /* _AHA_FOPS_H */ diff --git a/xlators/cluster/aha/src/aha-helpers.c b/xlators/cluster/aha/src/aha-helpers.c new file mode 100644 index 00000000000..e3b713688d3 --- /dev/null +++ b/xlators/cluster/aha/src/aha-helpers.c @@ -0,0 +1,46 @@ +#include "aha-helpers.h" + +struct aha_conf *aha_conf_new () +{ + struct aha_conf *conf = NULL; + + conf = GF_CALLOC (1, sizeof (*conf), gf_aha_mt_conf); + if (!conf) + goto err; + + INIT_LIST_HEAD (&conf->failed); + + LOCK_INIT (&conf->lock); +err: + return conf; +} + +void aha_conf_destroy (struct aha_conf *conf) +{ + LOCK_DESTROY (&conf->lock); + GF_FREE (conf); +} + +struct aha_fop *aha_fop_new () +{ + struct aha_fop *fop = NULL; + + fop = GF_CALLOC (1, sizeof (*fop), gf_aha_mt_fop); + if (!fop) + goto err; + + INIT_LIST_HEAD (&fop->list); + +err: + return fop; +} + +void aha_fop_destroy (struct aha_fop *fop) +{ + if (!fop) + return; + + call_stub_destroy (fop->stub); + fop->stub = NULL; + GF_FREE (fop); +} diff --git a/xlators/cluster/aha/src/aha-helpers.h b/xlators/cluster/aha/src/aha-helpers.h new file mode 100644 index 00000000000..d9cf9b3295d --- /dev/null +++ b/xlators/cluster/aha/src/aha-helpers.h @@ -0,0 +1,23 @@ +#ifndef _AHA_HELPERS_H +#define _AHA_HELPERS_H + +#include "aha.h" + +#define GF_AHA "aha" + +struct aha_conf *aha_conf_new (); + +void aha_conf_destroy (struct aha_conf *conf); + +struct aha_fop *aha_fop_new (); + +void aha_fop_destroy (struct aha_fop *fop); + +#define AHA_DESTROY_LOCAL(frame) \ + do { \ + struct aha_fop *fop = frame->local; \ + aha_fop_destroy (fop); \ + frame->local = NULL; \ + } while (0) \ + +#endif /* _AHA_HELPERS_H */ diff --git a/xlators/cluster/aha/src/aha-mem-types.h b/xlators/cluster/aha/src/aha-mem-types.h new file mode 100644 index 00000000000..117dda27e8b --- /dev/null +++ b/xlators/cluster/aha/src/aha-mem-types.h @@ -0,0 +1,22 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __AHA_MEM_TYPES_H__ +#define __AHA_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_aha_mem_types_ { + gf_aha_mt_begin_t = gf_common_mt_end + 1, + gf_aha_mt_conf, + gf_aha_mt_fop, + gf_aha_mt_end +}; +#endif diff --git a/xlators/cluster/aha/src/aha-retry.c b/xlators/cluster/aha/src/aha-retry.c new file mode 100644 index 00000000000..8810f913f42 --- /dev/null +++ b/xlators/cluster/aha/src/aha-retry.c @@ -0,0 +1,524 @@ +#include "aha.h" +#include "aha-helpers.h" +#include "aha-retry.h" +#include "aha-fops.h" + +/* + * AHA_RETRY_FOP: + * + * - We STACK_WIND the fop using the arguments in the call_stub. + * We use STACK_WIND because we need a *new* frame, since we already + * exhausted the existing frame with the original STACK_WIND. + * + * - After STACK_WIND completes, we can destroy this frame's local (which + * should be struct aha_fop *). The frame itself will get destroyed higher in + * the xlator graph, since its still part of the call stack. + */ +#define AHA_RETRY_FOP(fop, type, args ...) \ + do { \ + call_stub_t *stub = fop->stub; \ + call_frame_t *frame = fop->frame; \ + xlator_t *this = frame->this; \ + STACK_WIND (frame, aha_##type##_cbk, this, \ + this->fops->type, args); \ + AHA_DESTROY_LOCAL (frame); \ + } while (0) \ + +#define AHA_UNWIND_FOP(fop, type) \ + do { \ + call_frame_t *frame = fop->frame; \ + AHA_DESTROY_LOCAL (frame); \ + default_##type##_failure_cbk (frame, ETIMEDOUT); \ + } while (0) \ + +void +__aha_retry_force_unwind_fops (struct aha_conf *conf) +{ + struct aha_fop *fop = NULL; + struct aha_fop *tmp = NULL; + size_t ndrained = 0; + + /* + * Drain the queue. After we finish the loop, the list + * must be empty. + */ + list_for_each_entry_safe (fop, tmp, &conf->failed, list) { + list_del (&fop->list); + aha_force_unwind_fop (fop); + ndrained++; + } + + gf_log (GF_AHA, GF_LOG_WARNING, + "Force-unwound %"GF_PRI_SIZET" fops!", ndrained); + + assert (list_empty (&conf->failed)); +} + +void +aha_force_unwind_fops (struct aha_conf *conf) +{ + LOCK (&conf->lock); + { + __aha_retry_force_unwind_fops (conf); + } + UNLOCK (&conf->lock); +} + +void +__aha_retry_failed_fops (struct aha_conf *conf) +{ + struct aha_fop *fop = NULL; + struct aha_fop *tmp = NULL; + size_t ndrained = 0; + + /* + * Skip if the child is not up + */ + if (!conf->child_up) { + gf_log (GF_AHA, GF_LOG_WARNING, + "Waiting for child to come up before retrying."); + return; + } + + /* + * Skip if the the queue is empty. + */ + if (list_empty (&conf->failed)) { + gf_log (GF_AHA, GF_LOG_WARNING, "No FOPs to retry."); + } + + /* + * Drain the queue. After we finish the loop, the list + * must be empty. + */ + list_for_each_entry_safe (fop, tmp, &conf->failed, list) { + list_del (&fop->list); + aha_retry_fop (fop); + ndrained++; + } + + gf_log (GF_AHA, GF_LOG_WARNING, + "Drained %"GF_PRI_SIZET" fops!", ndrained); + + assert (list_empty (&conf->failed)); +} + + +void +aha_retry_failed_fops (struct aha_conf *conf) +{ + LOCK (&conf->lock); + { + __aha_retry_failed_fops (conf); + } + UNLOCK (&conf->lock); +} + +void aha_retry_fop (struct aha_fop *fop) +{ + call_stub_t *stub = fop->stub; + + switch (stub->fop) { + case GF_FOP_OPEN: + AHA_RETRY_FOP (fop, open, &stub->args.loc, stub->args.flags, + stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_CREATE: + AHA_RETRY_FOP (fop, create, &stub->args.loc, stub->args.flags, + stub->args.mode, stub->args.umask, + stub->args.fd, + stub->args.xdata); + break; + + case GF_FOP_STAT: + AHA_RETRY_FOP (fop, stat, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_READLINK: + AHA_RETRY_FOP (fop, readlink, &stub->args.loc, + stub->args.size, stub->args.xdata); + break; + + case GF_FOP_MKNOD: + AHA_RETRY_FOP (fop, mknod, &stub->args.loc, stub->args.mode, + stub->args.rdev, stub->args.umask, + stub->args.xdata); + break; + + case GF_FOP_MKDIR: + AHA_RETRY_FOP (fop, mkdir, &stub->args.loc, stub->args.mode, + stub->args.umask, stub->args.xdata); + break; + + case GF_FOP_UNLINK: + AHA_RETRY_FOP (fop, unlink, &stub->args.loc, stub->args.xflag, + stub->args.xdata); + break; + + case GF_FOP_RMDIR: + AHA_RETRY_FOP (fop, rmdir, &stub->args.loc, + stub->args.flags, stub->args.xdata); + break; + + case GF_FOP_SYMLINK: + AHA_RETRY_FOP (fop, symlink, stub->args.linkname, + &stub->args.loc, stub->args.umask, + stub->args.xdata); + break; + + case GF_FOP_RENAME: + AHA_RETRY_FOP (fop, rename, &stub->args.loc, + &stub->args.loc2, stub->args.xdata); + break; + + case GF_FOP_LINK: + AHA_RETRY_FOP (fop, link, &stub->args.loc, + &stub->args.loc2, stub->args.xdata); + break; + + case GF_FOP_TRUNCATE: + AHA_RETRY_FOP (fop, truncate, &stub->args.loc, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_READ: + AHA_RETRY_FOP (fop, readv, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.flags, + stub->args.xdata); + break; + + case GF_FOP_WRITE: + AHA_RETRY_FOP (fop, writev, stub->args.fd, stub->args.vector, + stub->args.count, stub->args.offset, + stub->args.flags, stub->args.iobref, + stub->args.xdata); + break; + + case GF_FOP_STATFS: + AHA_RETRY_FOP (fop, statfs, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_FLUSH: + AHA_RETRY_FOP (fop, flush, stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_FSYNC: + AHA_RETRY_FOP (fop, fsync, stub->args.fd, stub->args.datasync, + stub->args.xdata); + break; + + case GF_FOP_SETXATTR: + AHA_RETRY_FOP (fop, setxattr, &stub->args.loc, stub->args.xattr, + stub->args.flags, stub->args.xdata); + break; + + case GF_FOP_GETXATTR: + AHA_RETRY_FOP (fop, getxattr, &stub->args.loc, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_FSETXATTR: + AHA_RETRY_FOP (fop, fsetxattr, stub->args.fd, + stub->args.xattr, stub->args.flags, + stub->args.xdata); + break; + + case GF_FOP_FGETXATTR: + AHA_RETRY_FOP (fop, fgetxattr, stub->args.fd, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_REMOVEXATTR: + AHA_RETRY_FOP (fop, removexattr, &stub->args.loc, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_FREMOVEXATTR: + AHA_RETRY_FOP (fop, fremovexattr, stub->args.fd, + stub->args.name, stub->args.xdata); + break; + + case GF_FOP_OPENDIR: + AHA_RETRY_FOP (fop, opendir, &stub->args.loc, + stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_FSYNCDIR: + AHA_RETRY_FOP (fop, fsyncdir, stub->args.fd, + stub->args.datasync, stub->args.xdata); + break; + + case GF_FOP_ACCESS: + AHA_RETRY_FOP (fop, access, &stub->args.loc, + stub->args.mask, stub->args.xdata); + break; + + case GF_FOP_FTRUNCATE: + AHA_RETRY_FOP (fop, ftruncate, stub->args.fd, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_FSTAT: + AHA_RETRY_FOP (fop, fstat, stub->args.fd, stub->args.xdata); + break; + + case GF_FOP_LK: + AHA_RETRY_FOP (fop, lk, stub->args.fd, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_INODELK: + AHA_RETRY_FOP (fop, inodelk, stub->args.volume, + &stub->args.loc, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_FINODELK: + AHA_RETRY_FOP (fop, finodelk, stub->args.volume, + stub->args.fd, stub->args.cmd, + &stub->args.lock, stub->args.xdata); + break; + + case GF_FOP_ENTRYLK: + AHA_RETRY_FOP (fop, entrylk, stub->args.volume, &stub->args.loc, + stub->args.name, stub->args.entrylkcmd, + stub->args.entrylktype, stub->args.xdata); + break; + + case GF_FOP_FENTRYLK: + AHA_RETRY_FOP (fop, fentrylk, stub->args.volume, stub->args.fd, + stub->args.name, stub->args.entrylkcmd, + stub->args.entrylktype, stub->args.xdata); + break; + + case GF_FOP_LOOKUP: + AHA_RETRY_FOP (fop, lookup, &stub->args.loc, stub->args.xdata); + break; + + case GF_FOP_READDIR: + AHA_RETRY_FOP (fop, readdir, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_READDIRP: + AHA_RETRY_FOP (fop, readdirp, stub->args.fd, stub->args.size, + stub->args.offset, stub->args.xdata); + break; + + case GF_FOP_XATTROP: + AHA_RETRY_FOP (fop, xattrop, &stub->args.loc, stub->args.optype, + stub->args.xattr, stub->args.xdata); + break; + + case GF_FOP_FXATTROP: + AHA_RETRY_FOP (fop, fxattrop, stub->args.fd, stub->args.optype, + stub->args.xattr, stub->args.xdata); + break; + + case GF_FOP_SETATTR: + AHA_RETRY_FOP (fop, setattr, &stub->args.loc, &stub->args.stat, + stub->args.valid, stub->args.xdata); + break; + + case GF_FOP_FSETATTR: + AHA_RETRY_FOP (fop, fsetattr, stub->args.fd, &stub->args.stat, + stub->args.valid, stub->args.xdata); + break; + + default: + /* Some fops are not implemented yet: + * + * GF_FOP_NULL + * GF_FOP_RCHECKSUM + * GF_FOP_FORGET + * GF_FOP_RELEASE + * GF_FOP_RELEASEDIR + * GF_FOP_GETSPEC + * GF_FOP_FALLOCATE + * GF_FOP_DISCARD + * GF_FOP_ZEROFILL + * GF_FOP_MAXVALUE + * + */ + gf_log (GF_AHA, GF_LOG_CRITICAL, "Got unexpected FOP %s", + gf_fop_list[stub->fop]); + assert (0); + break; + } +} + +void +aha_force_unwind_fop (struct aha_fop *fop) +{ + call_stub_t *stub = fop->stub; + + switch (stub->fop) { + case GF_FOP_OPEN: + AHA_UNWIND_FOP (fop, open); + break; + + case GF_FOP_CREATE: + AHA_UNWIND_FOP (fop, create); + break; + + case GF_FOP_STAT: + AHA_UNWIND_FOP (fop, stat); + break; + + case GF_FOP_READLINK: + AHA_UNWIND_FOP (fop, readlink); + break; + + case GF_FOP_MKNOD: + AHA_UNWIND_FOP (fop, mknod); + break; + + case GF_FOP_MKDIR: + AHA_UNWIND_FOP (fop, mkdir); + break; + + case GF_FOP_UNLINK: + AHA_UNWIND_FOP (fop, unlink); + break; + + case GF_FOP_RMDIR: + AHA_UNWIND_FOP (fop, rmdir); + break; + + case GF_FOP_SYMLINK: + AHA_UNWIND_FOP (fop, symlink); + break; + + case GF_FOP_RENAME: + AHA_UNWIND_FOP (fop, rename); + break; + + case GF_FOP_LINK: + AHA_UNWIND_FOP (fop, link); + break; + + case GF_FOP_TRUNCATE: + AHA_UNWIND_FOP (fop, truncate); + break; + + case GF_FOP_READ: + AHA_UNWIND_FOP (fop, readv); + break; + + case GF_FOP_WRITE: + AHA_UNWIND_FOP (fop, writev); + break; + + case GF_FOP_STATFS: + AHA_UNWIND_FOP (fop, statfs); + break; + + case GF_FOP_FLUSH: + AHA_UNWIND_FOP (fop, flush); + break; + + case GF_FOP_FSYNC: + AHA_UNWIND_FOP (fop, fsync); + break; + + case GF_FOP_SETXATTR: + AHA_UNWIND_FOP (fop, setxattr); + break; + + case GF_FOP_GETXATTR: + AHA_UNWIND_FOP (fop, getxattr); + break; + + case GF_FOP_FSETXATTR: + AHA_UNWIND_FOP (fop, fsetxattr); + break; + + case GF_FOP_FGETXATTR: + AHA_UNWIND_FOP (fop, fgetxattr); + break; + + case GF_FOP_REMOVEXATTR: + AHA_UNWIND_FOP (fop, removexattr); + break; + + case GF_FOP_FREMOVEXATTR: + AHA_UNWIND_FOP (fop, fremovexattr); + break; + + case GF_FOP_OPENDIR: + AHA_UNWIND_FOP (fop, opendir); + break; + + case GF_FOP_FSYNCDIR: + AHA_UNWIND_FOP (fop, fsyncdir); + break; + + case GF_FOP_ACCESS: + AHA_UNWIND_FOP (fop, access); + break; + + case GF_FOP_FTRUNCATE: + AHA_UNWIND_FOP (fop, ftruncate); + break; + + case GF_FOP_FSTAT: + AHA_UNWIND_FOP (fop, fstat); + break; + + case GF_FOP_LK: + AHA_UNWIND_FOP (fop, lk); + break; + + case GF_FOP_INODELK: + AHA_UNWIND_FOP (fop, inodelk); + break; + + case GF_FOP_FINODELK: + AHA_UNWIND_FOP (fop, finodelk); + break; + + case GF_FOP_ENTRYLK: + AHA_UNWIND_FOP (fop, entrylk); + break; + + case GF_FOP_FENTRYLK: + AHA_UNWIND_FOP (fop, fentrylk); + break; + + case GF_FOP_LOOKUP: + AHA_UNWIND_FOP (fop, lookup); + break; + + case GF_FOP_READDIR: + AHA_UNWIND_FOP (fop, readdir); + break; + + case GF_FOP_READDIRP: + AHA_UNWIND_FOP (fop, readdirp); + break; + + case GF_FOP_XATTROP: + AHA_UNWIND_FOP (fop, xattrop); + break; + + case GF_FOP_FXATTROP: + AHA_UNWIND_FOP (fop, fxattrop); + break; + + case GF_FOP_SETATTR: + AHA_UNWIND_FOP (fop, setattr); + break; + + case GF_FOP_FSETATTR: + AHA_UNWIND_FOP (fop, fsetattr); + break; + + default: + /* Some fops are not implemented yet, + * and this would never happen cause we wouldn't + * queue them (see the assert statement in aha_retry_fop()) + */ + break; + } +} diff --git a/xlators/cluster/aha/src/aha-retry.h b/xlators/cluster/aha/src/aha-retry.h new file mode 100644 index 00000000000..5c8f56bca97 --- /dev/null +++ b/xlators/cluster/aha/src/aha-retry.h @@ -0,0 +1,12 @@ +#ifndef _AHA_RETRY_H +#define _AHA_RETRY_H + +void aha_retry_failed_fops (struct aha_conf *conf); + +void aha_retry_fop (struct aha_fop *fop); + +void aha_force_unwind_fops (struct aha_conf *conf); + +void aha_force_unwind_fop (struct aha_fop *fop); + +#endif /* _AHA_RETRY_H */ diff --git a/xlators/cluster/aha/src/aha.c b/xlators/cluster/aha/src/aha.c new file mode 100644 index 00000000000..2135e47f37f --- /dev/null +++ b/xlators/cluster/aha/src/aha.c @@ -0,0 +1,345 @@ +#include "aha-helpers.h" +#include "aha-retry.h" +#include "aha-fops.h" +#include "aha.h" + +#include "syncop.h" + + +int +retry_failed_fops_cbk (int ret, call_frame_t *frame, void *arg) +{ + /* Nothing to do here ... */ + return 0; +} + +int +retry_failed_fops (void *arg) +{ + xlator_t *this = NULL; + + struct aha_conf *conf = NULL; + + this = arg; + conf = this->private; + + aha_retry_failed_fops (conf); + + return 0; +} + +void +dispatch_fop_queue_drain (xlator_t *this) +{ + struct syncenv *env = NULL; + int ret = 0; + + env = this->ctx->env; + + ret = synctask_new (env, retry_failed_fops, + retry_failed_fops_cbk, NULL, this); + if (ret != 0) { + gf_log (GF_AHA, GF_LOG_CRITICAL, + "Failed to dispatch synctask " + "to drain fop queue!"); + } +} + +inline void +__aha_set_timer_status (struct aha_conf *conf, gf_boolean_t expired) +{ + conf->timer_expired = expired; +} + +inline gf_boolean_t +__aha_is_timer_expired (struct aha_conf *conf) +{ + return conf->timer_expired; +} + +gf_boolean_t +aha_is_timer_expired (struct aha_conf *conf) +{ + gf_boolean_t expired = _gf_false; + + LOCK (&conf->lock); + { + expired = __aha_is_timer_expired (conf); + } + UNLOCK (&conf->lock); + + return expired; +} + +void +aha_child_down_timer_expired (void *data) +{ + struct aha_conf *conf = NULL; + + conf = data; + + gf_log (GF_AHA, GF_LOG_INFO, "Timer expired!"); + + LOCK (&conf->lock); + { + __aha_set_timer_status (conf, _gf_true); + } + UNLOCK (&conf->lock); + + aha_force_unwind_fops ((struct aha_conf *)data); +} + +void +__aha_start_timer (struct aha_conf *conf) +{ + struct timespec child_down_timeout = { + .tv_sec = conf->server_wait_timeout, + .tv_nsec = 0 + }; + + __aha_set_timer_status (conf, _gf_false); + + conf->timer = gf_timer_call_after (conf->this->ctx, child_down_timeout, + aha_child_down_timer_expired, conf); + if (!conf->timer) { + gf_log (GF_AHA, GF_LOG_CRITICAL, "Failed to start the timer!"); + } + + gf_log (GF_AHA, GF_LOG_INFO, + "Registered timer for %lu seconds.", + conf->server_wait_timeout); +} + +void +__aha_cancel_timer (struct aha_conf *conf) +{ + if (!conf->timer) + goto out; + + gf_timer_call_cancel (conf->this->ctx, conf->timer); + conf->timer = NULL; + gf_log (GF_AHA, GF_LOG_INFO, "Timer cancelled!"); +out: + return; +} + +void +__aha_update_child_status (struct aha_conf *conf, int status) +{ + conf->child_up = status; +} + +void +aha_handle_child_up (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + LOCK (&conf->lock); + { + __aha_update_child_status ( + conf, AHA_CHILD_STATUS_UP); /* Mark the child as up */ + __aha_set_timer_status ( + conf, _gf_false); /* Timer is no longer expired */ + __aha_cancel_timer (conf); /* Cancel the timer */ + } + UNLOCK (&conf->lock); +} + +void +aha_handle_child_down (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + LOCK (&conf->lock); + { + __aha_update_child_status (conf, AHA_CHILD_STATUS_DOWN); + __aha_set_timer_status (conf, _gf_true); + __aha_start_timer (conf); + } + UNLOCK (&conf->lock); +} + +int32_t +notify (xlator_t *this, int32_t event, void *data, ...) +{ + switch (event) { + case GF_EVENT_CHILD_DOWN: + gf_log (this->name, GF_LOG_WARNING, "Got child-down event!"); + aha_handle_child_down (this); + break; + case GF_EVENT_CHILD_UP: + gf_log (this->name, GF_LOG_WARNING, "Got child-up event!"); + aha_handle_child_up (this); + dispatch_fop_queue_drain (this); + break; + default: + break; + } + + default_notify (this, event, data); + + return 0; +} + +int32_t +aha_priv_dump (xlator_t *this) +{ + return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_aha_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Memory accounting init failed!"); + return ret; + } + + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + struct aha_conf *conf = NULL; + + conf = this->private; + + GF_OPTION_RECONF ("server-wait-timeout-seconds", + conf->server_wait_timeout, + options, size_uint64, err); + + return 0; +err: + return -1; +} + +int +aha_init_options (xlator_t *this) +{ + struct aha_conf *conf = NULL; + + conf = this->private; + + GF_OPTION_INIT ("server-wait-timeout-seconds", + conf->server_wait_timeout, + size_uint64, err); + + return 0; +err: + return -1; +} + + +int +init (xlator_t *this) +{ + int ret = 0; + struct aha_conf *conf = NULL; + + conf = aha_conf_new (); + if (!conf) { + ret = -(ENOMEM); + goto err; + } + + conf->this = this; + this->private = conf; + + aha_init_options (this); + + /* init() completed successfully */ + goto done; +err: + gf_log (GF_AHA, GF_LOG_ERROR, + "init() failed, please see " + "logs for details."); + + /* Free all allocated memory */ + aha_conf_destroy (conf); +done: + return ret; +} + +void +fini (xlator_t *this) +{ + struct aha_conf *conf = this->private; + + aha_conf_destroy (conf); + + this->private = NULL; +} + +struct xlator_dumpops dumpops = { + .priv = aha_priv_dump, +}; + +struct xlator_fops cbks; + +struct xlator_fops fops = { + .lookup = aha_lookup, + .stat = aha_stat, + .readlink = aha_readlink, + .mknod = aha_mknod, + .mkdir = aha_mkdir, + .unlink = aha_unlink, + .rmdir = aha_rmdir, + .symlink = aha_symlink, + .rename = aha_rename, + .link = aha_link, + .truncate = aha_truncate, + .create = aha_create, + .open = aha_open, + .readv = aha_readv, + .writev = aha_writev, + .statfs = aha_statfs, + .flush = aha_flush, + .fsync = aha_fsync, + .setxattr = aha_setxattr, + .getxattr = aha_getxattr, + .removexattr = aha_removexattr, + .fsetxattr = aha_fsetxattr, + .fgetxattr = aha_fgetxattr, + .fremovexattr = aha_fremovexattr, + .opendir = aha_opendir, + .readdir = aha_readdir, + .readdirp = aha_readdirp, + .fsyncdir = aha_fsyncdir, + .access = aha_access, + .ftruncate = aha_ftruncate, + .fstat = aha_fstat, + .lk = aha_lk, + .lookup_cbk = aha_lookup_cbk, + .xattrop = aha_xattrop, + .fxattrop = aha_fxattrop, + .inodelk = aha_inodelk, + .finodelk = aha_finodelk, + .entrylk = aha_entrylk, + .fentrylk = aha_fentrylk, + .setattr = aha_setattr, + .fsetattr = aha_fsetattr, +}; + +struct volume_options options[] = { + { .key = {"server-wait-timeout-seconds"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 10, + .max = 20 * 60, + .default_value = TOSTRING (120), + .description = "Specifies the number of seconds the " + "AHA translator will wait " + "for a CHILD_UP event before " + "force-unwinding the frames it has " + "currently stored for retry." + }, + { .key = {NULL} } +}; diff --git a/xlators/cluster/aha/src/aha.h b/xlators/cluster/aha/src/aha.h new file mode 100644 index 00000000000..3dbf3199776 --- /dev/null +++ b/xlators/cluster/aha/src/aha.h @@ -0,0 +1,46 @@ +#ifndef _AHA_H +#define _AHA_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "statedump.h" +#include "call-stub.h" +#include "defaults.h" +#include "list.h" +#include "timer.h" + +#include "aha-mem-types.h" + +/* new() and destroy() functions for all structs can be found in + * aha-helpers.c + */ +struct aha_conf { + xlator_t *this; + uint8_t child_up; + gf_lock_t lock; + struct list_head failed; + gf_timer_t *timer; + gf_boolean_t timer_expired; + uint64_t server_wait_timeout; +}; + +struct aha_fop { + call_stub_t *stub; /* Only used to store function arguments */ + call_frame_t *frame; /* Frame corresponding to this fop */ + uint64_t tries; + struct list_head list; +}; + +enum { + AHA_CHILD_STATUS_DOWN = 0, + AHA_CHILD_STATUS_UP = 1, + AHA_CHILD_STATUS_MAX +}; + +gf_boolean_t aha_is_timer_expired (struct aha_conf *conf); + +#endif diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index a9714b02b79..a97d03bb055 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -5559,6 +5559,7 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, { dht_local_t *local = NULL; xlator_t *avail_subvol = NULL; + int op_errno = 0; local = frame->local; @@ -5571,9 +5572,15 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, subvol, subvol->fops->mknod, loc, mode, rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - - if (avail_subvol != subvol) { + /* This will return NULL if all subvolumes are full + * and/or no subvolume needs the min_free_disk limit + */ + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { local->params = dict_ref (params); local->rdev = rdev; local->mode = mode; @@ -5603,6 +5610,8 @@ dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, } out: return 0; +err: + return op_errno; } int32_t @@ -6242,8 +6251,12 @@ dht_mknod (call_frame_t *frame, xlator_t *this, } } - dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, rdev, mode, - umask, params); + op_errno = dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, + rdev, mode, umask, + params); + if (op_errno != 0) { + goto err; + } done: return 0; @@ -6738,6 +6751,7 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, { dht_local_t *local = NULL; xlator_t *avail_subvol = NULL; + int op_errno = 0; local = frame->local; @@ -6752,8 +6766,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, } else { avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { local->params = dict_ref (params); local->flags = flags; local->mode = mode; @@ -6780,6 +6796,10 @@ dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, } out: return 0; +err: + DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL, NULL); + return op_errno; } int @@ -6882,9 +6902,10 @@ dht_create_do (call_frame_t *frame) goto err; } - dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc, - local->flags, local->mode, - local->umask, local->fd, local->params); + dht_create_wind_to_avail_subvol (frame, this, subvol, + &local->loc, local->flags, + local->mode, local->umask, + local->fd, local->params); return 0; err: local->refresh_layout_unlock (frame, this, -1, 1); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 9e9ca712417..613a9d39816 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -300,6 +300,7 @@ struct dht_du { uint64_t avail_space; uint32_t log; uint32_t chunks; + gf_boolean_t is_full; }; typedef struct dht_du dht_du_t; @@ -484,6 +485,7 @@ struct dht_conf { dht_du_t *du_stats; double min_free_disk; double min_free_inodes; + gf_boolean_t min_free_strict_mode; char disk_unit; int32_t refresh_interval; gf_boolean_t unhashed_sticky_bit; @@ -549,6 +551,10 @@ struct dht_conf { gf_boolean_t lock_migration_enabled; gf_lock_t lock; + + /* du stats */ + uint32_t du_refresh_interval_sec; + gf_lock_t du_refresh_lock; }; typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 1eb9e63c531..1b20dabc61f 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -153,19 +153,25 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) call_frame_t *statfs_frame = NULL; dht_local_t *statfs_local = NULL; struct timeval tv = {0,}; + struct timeval cmp_tv = {0,}; loc_t tmp_loc = {0,}; conf = this->private; + /* Somebody else is already refreshing the statfs info */ + if (TRY_LOCK (&conf->du_refresh_lock) != 0) + return 0; + gettimeofday (&tv, NULL); + cmp_tv = conf->last_stat_fetch; + cmp_tv.tv_sec += conf->du_refresh_interval_sec; + /* make it root gfid, should be enough to get the proper info back */ tmp_loc.gfid[15] = 1; - if (tv.tv_sec > (conf->refresh_interval - + conf->last_stat_fetch.tv_sec)) { - + if (timercmp (&tv, &cmp_tv, >)) { statfs_frame = copy_frame (frame); if (!statfs_frame) { goto err; @@ -200,14 +206,18 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc) &tmp_loc, statfs_local->params); } - conf->last_stat_fetch.tv_sec = tv.tv_sec; + conf->last_stat_fetch = tv; } - return 0; + ret = 0; + goto out; err: if (statfs_frame) DHT_STACK_DESTROY (statfs_frame); - return -1; + ret = -1; +out: + UNLOCK (&conf->du_refresh_lock); + return ret; } @@ -223,8 +233,13 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) conf = this->private; /* Check for values above specified percent or free disk */ - LOCK (&conf->subvolume_lock); - { + if (TRY_LOCK (&conf->subvolume_lock) != 0) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + return conf->du_stats[i].is_full; + } + } + } else { for (i = 0; i < conf->subvolume_cnt; i++) { if (subvol == conf->subvolumes[i]) { if (conf->disk_unit == 'p') { @@ -248,7 +263,15 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) } } } - } + + /* i will be less than subvolume_cnt if either of + * these booleans are true */ + is_subvol_filled = ( + subvol_filled_space || subvol_filled_inodes); + if (is_subvol_filled) { + conf->du_stats[i].is_full = is_subvol_filled; + } + } UNLOCK (&conf->subvolume_lock); if (subvol_filled_space && conf->subvolume_status[i]) { @@ -273,8 +296,6 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) } } - is_subvol_filled = (subvol_filled_space || subvol_filled_inodes); - return is_subvol_filled; } @@ -309,15 +330,8 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, LOCK (&conf->subvolume_lock); { - avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol, layout); - if(!avail_subvol) - { - avail_subvol = dht_subvol_maxspace_nonzeroinode(this, - subvol, - layout); - } - } UNLOCK (&conf->subvolume_lock); out: @@ -325,7 +339,6 @@ out: gf_msg_debug (this->name, 0, "No subvolume has enough free space \ and/or inodes to create"); - avail_subvol = subvol; } if (layout) diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index 8abf0d59b88..ac0f0e186fa 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -104,10 +104,15 @@ dht_open (call_frame_t *frame, xlator_t *this, xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + conf = this->private; + + if (conf->min_free_strict_mode == _gf_true) + dht_get_du_info (frame, this, loc); local = dht_local_init (frame, loc, fd, GF_FOP_OPEN); if (!local) { @@ -121,6 +126,11 @@ dht_open (call_frame_t *frame, xlator_t *this, "no cached subvolume for fd=%p", fd); op_errno = EINVAL; goto err; + } else if (conf->min_free_strict_mode == _gf_true && + dht_is_subvol_filled (this, subvol) == _gf_true && + flags & O_APPEND) { + op_errno = ENOSPC; + goto err; } local->rebalance.flags = flags; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 112685b659e..7420461da76 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -161,11 +161,16 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, xlator_t *subvol = NULL; int op_errno = -1; dht_local_t *local = NULL; + loc_t *nil_loc = {0,}; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + conf = this->private; + + local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE); if (!local) { @@ -173,15 +178,21 @@ dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto err; } + if (conf->min_free_strict_mode == _gf_true) + dht_get_du_info (frame, this, nil_loc); + subvol = local->cached_subvol; if (!subvol) { gf_msg_debug (this->name, 0, "no cached subvolume for fd=%p", fd); op_errno = EINVAL; goto err; + } else if (conf->min_free_strict_mode == _gf_true && + dht_is_subvol_filled (this, subvol) == _gf_true) { + op_errno = ENOSPC; + goto err; } - local->rebalance.vector = iov_dup (vector, count); local->rebalance.offset = off; local->rebalance.count = count; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index a478f06b2a9..ffd8bac9e4f 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -20,7 +20,7 @@ #define GF_DISK_SECTOR_SIZE 512 #define DHT_REBALANCE_PID 4242 /* Change it if required */ -#define DHT_REBALANCE_BLKSIZE (128 * 1024) +#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */ #define MAX_MIGRATE_QUEUE_COUNT 500 #define MIN_MIGRATE_QUEUE_COUNT 200 diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 5c810f0dc77..ccbf66b626d 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -443,6 +443,8 @@ dht_reconfigure (xlator_t *this, dict_t *options) conf->disk_unit = 0; if (conf->min_free_disk < 100.0) conf->disk_unit = 'p'; + GF_OPTION_RECONF ("min-free-strict-mode", conf->min_free_strict_mode, + options, bool, out); GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, percent, out); @@ -499,6 +501,9 @@ dht_reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("use-readdirp", conf->use_readdirp, options, bool, out); + + GF_OPTION_RECONF ("du-refresh-interval-sec", + conf->du_refresh_interval_sec, options, uint32, out); ret = 0; out: return ret; @@ -720,7 +725,10 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); + err); + + GF_OPTION_INIT ("min-free-strict-mode", conf->min_free_strict_mode, + bool, err); GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, err); @@ -738,6 +746,11 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled, bool, err); + GF_OPTION_INIT ("du-refresh-interval-sec", + conf->du_refresh_interval_sec, uint32, err); + + LOCK_INIT (&conf->du_refresh_lock); + if (defrag) { defrag->lock_migration_enabled = conf->lock_migration_enabled; @@ -907,6 +920,14 @@ struct volume_options options[] = { "process starts balancing out the cluster, and logs will appear " "in log files", }, + { .key = {"min-free-strict-mode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "When enabled, will reject in-flight writes or " + "append operations to files when the target subvolume falls " + "below min-free-(disk|inodes). When disabled, these are allowed " + "through and only new files will be affected.", + }, { .key = {"min-free-inodes"}, .type = GF_OPTION_TYPE_PERCENT, .default_value = "5%", @@ -1089,5 +1110,14 @@ struct volume_options options[] = { " associated with a file during rebalance" }, + { .key = {"du-refresh-interval-sec"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "60", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies how many seconds before subvolume statfs " + "info is re-validated." + }, + { .key = {NULL} }, }; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 56e17d6e884..996faffa37f 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -325,7 +325,10 @@ nufa_create (call_frame_t *frame, xlator_t *this, local); } - if (subvol != avail_subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (subvol != avail_subvol) { /* create a link file instead of actual file */ local->params = dict_ref (params); local->mode = mode; @@ -430,7 +433,10 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, local); } - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { /* Create linkfile first */ local->params = dict_ref (params); diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index f1e9a399442..8b14ac99b8f 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -440,7 +440,10 @@ switch_create (call_frame_t *frame, xlator_t *this, local); } - if (subvol != avail_subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (subvol != avail_subvol) { /* create a link file instead of actual file */ local->mode = mode; local->flags = flags; @@ -540,7 +543,10 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, local); } - if (avail_subvol != subvol) { + if (!avail_subvol) { + op_errno = ENOSPC; + goto err; + } else if (avail_subvol != subvol) { /* Create linkfile first */ local->params = dict_ref (params); |
