diff options
author | Pranith Kumar K <pkarampu@redhat.com> | 2014-09-23 12:43:02 +0530 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2014-09-23 07:21:44 -0700 |
commit | e149a051bf226e16c6b7f1a816f998dace85d33d (patch) | |
tree | 8b454f6b00281aaf99dbf69c0c726f0ba2f493ea | |
parent | 371bb42410ca5bbcf1f13ad1c8d015fcbe6ec5ce (diff) |
cluster/afr: Don't start heal when lookup succeeds on < 2 children
Problem:
Self-heal cannot be performed unless the lookup succeeds on at least 2 children.
Currently, if all the lookups fail, the pending changelog in the xattrs is all
zeros, so every child is treated as a source. This leads to crashes when later
code paths assume that some data structures are populated properly.
Fix:
Don't proceed with self-heal when lookup succeeds on fewer than 2 children.
BUG: 1128721
Change-Id: Iffdf0feebb6f98812d9d01cdd0cf97f3e19ba76f
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/8698
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 17 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 4 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-entry.c | 6 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 4 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-name.c | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 4 |
6 files changed, 29 insertions, 8 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 0158948d728..b104e6b7869 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -324,6 +324,12 @@ afr_selfheal_find_direction (xlator_t *this, struct afr_reply *replies, accused = alloca0 (priv->child_count); matrix = ALLOC_MATRIX(priv->child_count, int); + if (afr_success_count (replies, + priv->child_count) < AFR_SH_MIN_PARTICIPANTS) { + /* Treat this just like locks not being acquired */ + return -ENOTCONN; + } + /* First construct the pending matrix for further analysis */ afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); @@ -502,6 +508,17 @@ afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, priv->child_up); } +unsigned int +afr_success_count (struct afr_reply *replies, unsigned int count) +{ + int i = 0; + unsigned int success = 0; + + for (i = 0; i < count; i++) + if (replies[i].valid && replies[i].op_ret == 0) + success++; + return success; +} int afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 455648b7564..bee7682a23b 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -508,7 +508,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0, data_lock); { - if (ret < 2) { + if (ret < AFR_SH_MIN_PARTICIPANTS) { ret = -ENOTCONN; goto unlock; } @@ -611,7 +611,7 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); { - if (ret < 2) { + if (ret < AFR_SH_MIN_PARTICIPANTS) { /* Either less than two subvols available, or another selfheal (from another server) is in progress. 
Skip for now in any case there isn't anything to do. diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 0cf65009c5f..45ce881e123 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -283,7 +283,7 @@ afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, name, locked_on); { - if (ret < 2) { + if (ret < AFR_SH_MIN_PARTICIPANTS) { ret = -ENOTCONN; goto unlock; } @@ -491,7 +491,7 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL, data_lock); { - if (ret < 2) { + if (ret < AFR_SH_MIN_PARTICIPANTS) { ret = -ENOTCONN; goto unlock; } @@ -567,7 +567,7 @@ afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode) ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); { - if (ret < 2) { + if (ret < AFR_SH_MIN_PARTICIPANTS) { /* Either less than two subvols available, or another selfheal (from another server) is in progress. Skip for now in any case there isn't anything to do. 
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index e98728ba54f..2c5f3fd652c 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -228,7 +228,7 @@ __afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, ret = afr_selfheal_inodelk (frame, this, inode, this->name, LLONG_MAX - 1, 0, data_lock); { - if (ret < 2) { + if (ret < AFR_SH_MIN_PARTICIPANTS) { ret = -ENOTCONN; goto unlock; } @@ -274,7 +274,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode) ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); { - if (ret < 2) { + if (ret < AFR_SH_MIN_PARTICIPANTS) { /* Either less than two subvols available, or another selfheal (from another server) is in progress. Skip for now in any case there isn't anything to do. diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index a3020f4e1a7..c5d126185c7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -571,7 +571,7 @@ afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname, locked_on); { - if (ret < 2) { + if (ret < AFR_SH_MIN_PARTICIPANTS) { ret = -ENOTCONN; goto unlock; } diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 31f12a4e74a..7936659e5e4 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -12,6 +12,7 @@ #ifndef _AFR_SELFHEAL_H #define _AFR_SELFHEAL_H +#define AFR_SH_MIN_PARTICIPANTS 2 /* Perform fop on all UP subvolumes and wait for all callbacks to return */ @@ -181,4 +182,7 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, inode_t* afr_inode_link (inode_t *inode, struct iatt 
*iatt); + +unsigned int +afr_success_count (struct afr_reply *replies, unsigned int count); #endif /* !_AFR_SELFHEAL_H */ |