summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2014-09-13 12:08:56 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2014-09-24 01:51:35 -0700
commit70d76f20ee127fe7e8e52b2d67e2362283a01f34 (patch)
treea0bd58a4f86dbe26763dc6b008818eac7e190e40
parentbd592f8b8379087604f35c3b377f6e94b9e1697d (diff)
cluster/afr: Fix spurious metadata self-heals
- Added logging for metadata and data self-heals which helped in debugging this issue. - Added checks to skip self-heals when no sinks are available to heal Change-Id: I0d50dceb84cd9ad4fe00e0b749ddf7d4ff42348a BUG: 1128721 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/8709 Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com>
-rw-r--r--xlators/cluster/afr/src/afr-common.c7
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c3
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c32
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c38
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c29
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h4
-rw-r--r--xlators/cluster/afr/src/afr.h2
7 files changed, 86 insertions, 29 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index d1bce6284a8..3e745e2491e 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1198,8 +1198,8 @@ xattr_is_equal (dict_t *this, char *key1, data_t *value1, void *data)
* We need to do both because ignoring glusterfs' internal xattrs
* happens only in xattr_is_equal().
*/
-static gf_boolean_t
-dicts_are_equal (dict_t *dict1, dict_t *dict2)
+gf_boolean_t
+afr_xattrs_are_equal (dict_t *dict1, dict_t *dict2)
{
int ret = 0;
@@ -1596,7 +1596,8 @@ afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
}
/*Check if xattrs need heal*/
- if (!dicts_are_equal (replies[first].xdata, replies[i].xdata))
+ if (!afr_xattrs_are_equal (replies[first].xdata,
+ replies[i].xdata))
start = _gf_true;
}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 2585c443e1c..29a926dbd97 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -202,6 +202,9 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
/* very first transaction on this inode */
goto refresh;
+ gf_log (this->name, GF_LOG_DEBUG, "%s: generation now vs cached: %d, "
+ "%d", uuid_utoa (inode->gfid), local->event_generation,
+ event_generation);
if (local->event_generation != event_generation)
/* servers have disconnected / reconnected, and possibly
rebooted, very likely changing the state of freshness
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index b104e6b7869..b855f98ddf2 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -384,6 +384,38 @@ afr_selfheal_find_direction (xlator_t *this, struct afr_reply *replies,
return 0;
}
+void
+afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
+ int source, unsigned char *healed_sinks)
+{
+ char *status = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
+ afr_private_t *priv = NULL;
+ gf_loglevel_t loglevel = GF_LOG_NONE;
+ int i = 0;
+
+ priv = this->private;
+ sinks_str = alloca0 (priv->child_count * 8);
+ p = sinks_str;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ p += sprintf (p, "%d ", i);
+ }
+
+ if (ret < 0) {
+ status = "Failed";
+ loglevel = GF_LOG_DEBUG;
+ } else {
+ status = "Completed";
+ loglevel = GF_LOG_INFO;
+ }
+
+ gf_log (this->name, loglevel, "%s %s selfheal on %s. "
+ "source=%d sinks=%s", status, type, uuid_utoa (gfid),
+ source, sinks_str);
+}
int
afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 25e310fd2a3..74088f4bf6d 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -300,29 +300,14 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct afr_reply *replies)
{
afr_private_t *priv = NULL;
- int i = 0;
off_t off = 0;
size_t block = 128 * 1024;
int type = AFR_SELFHEAL_DATA_FULL;
int ret = -1;
call_frame_t *iter_frame = NULL;
- char *sinks_str = NULL;
- char *p = NULL;
priv = this->private;
- sinks_str = alloca0 (priv->child_count * 8);
- p = sinks_str;
- for (i = 0; i < priv->child_count; i++) {
- if (!healed_sinks[i])
- continue;
- p += sprintf (p, "%d ", i);
- }
-
- gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. "
- "source=%d sinks=%s",
- uuid_utoa (fd->inode->gfid), source, sinks_str);
-
type = afr_data_self_heal_type_get (priv, healed_sinks, source,
replies);
@@ -331,6 +316,11 @@ afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
return -ENOMEM;
for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+ if (AFR_COUNT (healed_sinks, priv->child_count) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
ret = afr_selfheal_data_block (iter_frame, this, fd, source,
healed_sinks, off, block, type,
replies);
@@ -511,6 +501,10 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
data_lock);
{
if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ gf_log (this->name, GF_LOG_DEBUG, "%s: Skipping "
+ "self-heal as only %d number of subvolumes "
+ "could be locked", uuid_utoa (fd->inode->gfid),
+ ret);
ret = -ENOTCONN;
goto unlock;
}
@@ -559,6 +553,9 @@ out:
afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
LLONG_MAX - 2, 1, compat_lock);
+ afr_log_selfheal (fd->inode->gfid, this, ret, "data", source,
+ healed_sinks);
+
if (locked_replies)
afr_replies_wipe (locked_replies, priv->child_count);
@@ -605,8 +602,11 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
priv = this->private;
fd = afr_selfheal_data_open (this, inode);
- if (!fd)
- return -EIO;
+ if (!fd) {
+ gf_log (this->name, GF_LOG_DEBUG, "%s: Failed to open",
+ uuid_utoa (inode->gfid));
+ return -EIO;
+ }
locked_on = alloca0 (priv->child_count);
@@ -614,6 +614,10 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
locked_on);
{
if (ret < AFR_SH_MIN_PARTICIPANTS) {
+ gf_log (this->name, GF_LOG_DEBUG, "%s: Skipping "
+ "self-heal as only %d number of subvolumes "
+ "could be locked", uuid_utoa (fd->inode->gfid),
+ ret);
/* Either less than two subvols available, or another
selfheal (from another server) is in progress. Skip
for now in any case there isn't anything to do.
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 2c5f3fd652c..5b9e290e49a 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -97,7 +97,6 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
struct iatt first = {0, };
int source = -1;
int sources_count = 0;
- int ret = 0;
priv = this->private;
@@ -136,6 +135,10 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
!IA_EQUAL (first, replies[i].poststat, uid) ||
!IA_EQUAL (first, replies[i].poststat, gid) ||
!IA_EQUAL (first, replies[i].poststat, prot)) {
+ gf_log (this->name, GF_LOG_DEBUG, "%s: iatt mismatch "
+ "for source(%d) vs (%d)",
+ uuid_utoa (replies[source].poststat.ia_gfid),
+ source, i);
sources[i] = 0;
healed_sinks[i] = 1;
}
@@ -144,14 +147,12 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
for (i =0; i < priv->child_count; i++) {
if (!sources[i] || i == source)
continue;
- if (replies[source].xdata->count != replies[i].xdata->count) {
- sources[i] = 0;
- healed_sinks[i] = 1;
- continue;
- }
- ret = dict_foreach(replies[source].xdata, xattr_is_equal,
- (void*)replies[i].xdata);
- if (ret == -1) {
+ if (!afr_xattrs_are_equal (replies[source].xdata,
+ replies[i].xdata)) {
+ gf_log (this->name, GF_LOG_DEBUG, "%s: xattr mismatch "
+ "for source(%d) vs (%d)",
+ uuid_utoa (replies[source].poststat.ia_gfid),
+ source, i);
sources[i] = 0;
healed_sinks[i] = 1;
}
@@ -241,6 +242,12 @@ __afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode,
goto unlock;
source = ret;
+
+ if (AFR_COUNT (healed_sinks, priv->child_count) == 0) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
ret = __afr_selfheal_metadata_do (frame, this, inode, source,
healed_sinks, locked_replies);
if (ret)
@@ -254,6 +261,10 @@ __afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode,
unlock:
afr_selfheal_uninodelk (frame, this, inode, this->name,
LLONG_MAX -1, 0, data_lock);
+
+ afr_log_selfheal (inode->gfid, this, ret, "metadata", source,
+ healed_sinks);
+
if (locked_replies)
afr_replies_wipe (locked_replies, priv->child_count);
return ret;
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 7936659e5e4..c32ec120a50 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -185,4 +185,8 @@ afr_inode_link (inode_t *inode, struct iatt *iatt);
unsigned int
afr_success_count (struct afr_reply *replies, unsigned int count);
+
+void
+afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
+ int source, unsigned char *healed_sinks);
#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 4b894c5c464..2b0ced58f6c 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1006,4 +1006,6 @@ afr_remove_eager_lock_stub (afr_local_t *local);
void
afr_replies_wipe (struct afr_reply *replies, int count);
+gf_boolean_t
+afr_xattrs_are_equal (dict_t *dict1, dict_t *dict2);
#endif /* __AFR_H__ */