diff options
author | Pranith Kumar K <pkarampu@redhat.com> | 2014-09-13 12:08:56 +0530 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-09-24 10:38:57 -0700 |
commit | dd80d06145a5911e697b724a44fd4d858e3a9134 (patch) | |
tree | e0f5ce6cf128a3e9f925b578e2f5ce69710a0dc3 | |
parent | 3b4664d7aa8eda5011fedeaeaad3281c1b06a517 (diff) |
cluster/afr: Fix spurious metadata self-healsv3.6.0beta2
Backport of http://review.gluster.org/8709
- Added logging for metadata and data self-heals which helped
in debugging this issue.
- Added checks to skip self-heals when no sinks are available to heal
BUG: 1145987
Change-Id: Ide03af4f531a1280ec8ad95b627285df4d7bc42d
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/8832
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 7 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 3 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 32 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 38 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 29 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 4 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 2 |
7 files changed, 86 insertions, 29 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index d1bce6284a8..3e745e2491e 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1198,8 +1198,8 @@ xattr_is_equal (dict_t *this, char *key1, data_t *value1, void *data) * We need to do both because ignoring glusterfs' internal xattrs * happens only in xattr_is_equal(). */ -static gf_boolean_t -dicts_are_equal (dict_t *dict1, dict_t *dict2) +gf_boolean_t +afr_xattrs_are_equal (dict_t *dict1, dict_t *dict2) { int ret = 0; @@ -1596,7 +1596,8 @@ afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this) } /*Check if xattrs need heal*/ - if (!dicts_are_equal (replies[first].xdata, replies[i].xdata)) + if (!afr_xattrs_are_equal (replies[first].xdata, + replies[i].xdata)) start = _gf_true; } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 2585c443e1c..29a926dbd97 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -202,6 +202,9 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, /* very first transaction on this inode */ goto refresh; + gf_log (this->name, GF_LOG_DEBUG, "%s: generation now vs cached: %d, " + "%d", uuid_utoa (inode->gfid), local->event_generation, + event_generation); if (local->event_generation != event_generation) /* servers have disconnected / reconnected, and possibly rebooted, very likely changing the state of freshness diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index b104e6b7869..b855f98ddf2 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -384,6 +384,38 @@ afr_selfheal_find_direction (xlator_t *this, struct afr_reply *replies, return 0; } +void +afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type, + int source, unsigned char *healed_sinks) +{ + char *status = NULL; + char *sinks_str = NULL; + char *p = NULL; + afr_private_t *priv = NULL; + gf_loglevel_t loglevel = GF_LOG_NONE; + int i = 0; + + priv = this->private; + sinks_str = alloca0 (priv->child_count * 8); + p = sinks_str; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + p += sprintf (p, "%d ", i); + } + + if (ret < 0) { + status = "Failed"; + loglevel = GF_LOG_DEBUG; + } else { + status = "Completed"; + loglevel = GF_LOG_INFO; + } + + gf_log (this->name, loglevel, "%s %s selfheal on %s. " + "source=%d sinks=%s", status, type, uuid_utoa (gfid), + source, sinks_str); +} int afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 25e310fd2a3..74088f4bf6d 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -300,29 +300,14 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, struct afr_reply *replies) { afr_private_t *priv = NULL; - int i = 0; off_t off = 0; size_t block = 128 * 1024; int type = AFR_SELFHEAL_DATA_FULL; int ret = -1; call_frame_t *iter_frame = NULL; - char *sinks_str = NULL; - char *p = NULL; priv = this->private; - sinks_str = alloca0 (priv->child_count * 8); - p = sinks_str; - for (i = 0; i < priv->child_count; i++) { - if (!healed_sinks[i]) - continue; - p += sprintf (p, "%d ", i); - } - - gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. " - "source=%d sinks=%s", - uuid_utoa (fd->inode->gfid), source, sinks_str); - type = afr_data_self_heal_type_get (priv, healed_sinks, source, replies); @@ -331,6 +316,11 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, return -ENOMEM; for (off = 0; off < replies[source].poststat.ia_size; off += block) { + if (AFR_COUNT (healed_sinks, priv->child_count) == 0) { + ret = -ENOTCONN; + goto out; + } + ret = afr_selfheal_data_block (iter_frame, this, fd, source, healed_sinks, off, block, type, replies); @@ -511,6 +501,10 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, data_lock); { if (ret < AFR_SH_MIN_PARTICIPANTS) { + gf_log (this->name, GF_LOG_DEBUG, "%s: Skipping " + "self-heal as only %d number of subvolumes " + "could be locked", uuid_utoa (fd->inode->gfid), + ret); ret = -ENOTCONN; goto unlock; } @@ -559,6 +553,9 @@ out: afr_selfheal_uninodelk (frame, this, fd->inode, this->name, LLONG_MAX - 2, 1, compat_lock); + afr_log_selfheal (fd->inode->gfid, this, ret, "data", source, + healed_sinks); + if (locked_replies) afr_replies_wipe (locked_replies, priv->child_count); @@ -605,8 +602,11 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) priv = this->private; fd = afr_selfheal_data_open (this, inode); - if (!fd) - return -EIO; + if (!fd) { + gf_log (this->name, GF_LOG_DEBUG, "%s: Failed to open", + uuid_utoa (inode->gfid)); + return -EIO; + } locked_on = alloca0 (priv->child_count); @@ -614,6 +614,10 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) locked_on); { if (ret < AFR_SH_MIN_PARTICIPANTS) { + gf_log (this->name, GF_LOG_DEBUG, "%s: Skipping " + "self-heal as only %d number of subvolumes " + "could be locked", uuid_utoa (fd->inode->gfid), + ret); /* Either less than two subvols available, or another selfheal (from another server) is in progress. Skip for now in any case there isn't anything to do. diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 2c5f3fd652c..5b9e290e49a 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -97,7 +97,6 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, struct iatt first = {0, }; int source = -1; int sources_count = 0; - int ret = 0; priv = this->private; @@ -136,6 +135,10 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, !IA_EQUAL (first, replies[i].poststat, uid) || !IA_EQUAL (first, replies[i].poststat, gid) || !IA_EQUAL (first, replies[i].poststat, prot)) { + gf_log (this->name, GF_LOG_DEBUG, "%s: iatt mismatch " + "for source(%d) vs (%d)", + uuid_utoa (replies[source].poststat.ia_gfid), + source, i); sources[i] = 0; healed_sinks[i] = 1; } @@ -144,14 +147,12 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, for (i =0; i < priv->child_count; i++) { if (!sources[i] || i == source) continue; - if (replies[source].xdata->count != replies[i].xdata->count) { - sources[i] = 0; - healed_sinks[i] = 1; - continue; - } - ret = dict_foreach(replies[source].xdata, xattr_is_equal, - (void*)replies[i].xdata); - if (ret == -1) { + if (!afr_xattrs_are_equal (replies[source].xdata, + replies[i].xdata)) { + gf_log (this->name, GF_LOG_DEBUG, "%s: xattr mismatch " + "for source(%d) vs (%d)", + uuid_utoa (replies[source].poststat.ia_gfid), + source, i); sources[i] = 0; healed_sinks[i] = 1; } @@ -241,6 +242,12 @@ __afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, goto unlock; source = ret; + + if (AFR_COUNT (healed_sinks, priv->child_count) == 0) { + ret = -ENOTCONN; + goto unlock; + } + ret = __afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks, locked_replies); if (ret) @@ -254,6 +261,10 @@ __afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, unlock: afr_selfheal_uninodelk (frame, this, inode, this->name, LLONG_MAX -1, 0, data_lock); + + afr_log_selfheal (inode->gfid, this, ret, "metadata", source, + healed_sinks); + if (locked_replies) afr_replies_wipe (locked_replies, priv->child_count); return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 7936659e5e4..c32ec120a50 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -185,4 +185,8 @@ afr_inode_link (inode_t *inode, struct iatt *iatt); unsigned int afr_success_count (struct afr_reply *replies, unsigned int count); + +void +afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type, + int source, unsigned char *healed_sinks); #endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 4b894c5c464..2b0ced58f6c 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1006,4 +1006,6 @@ afr_remove_eager_lock_stub (afr_local_t *local); void afr_replies_wipe (struct afr_reply *replies, int count); +gf_boolean_t +afr_xattrs_are_equal (dict_t *dict1, dict_t *dict2); #endif /* __AFR_H__ */ |