summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pranithk@gluster.com>2011-09-03 14:13:06 +0530
committerAnand Avati <avati@gluster.com>2011-09-06 02:24:42 -0700
commit10d49cf0878e96301ea366276e27e91b5527483e (patch)
tree147dfb8ebcefff4a0b5b621878609812c315ba9e
parentf1fec244b043fc74999692b181d321827daf8677 (diff)
cluster/afr: Prevent double big lock when data self-heal loops are not spawned
The steps in normal data self heal: 1) take big lock by self-heal frame. Get the xattrs/stat to decide source, sink information. 2) spawn loop frames which perform self-heal by taking small locks on the file. Every time a new lock is taken and the old lock is released. 3) Before releasing the final small lock a big lock is taken by the self-heal frame, and unlock on small-lock. Erasing of the pending xattrs happen then the big unlock happen and that is the end of the data self-heal. When a data self-heal is needed for a file and the fop that triggers the self-heal is open with O_TRUNC. Fuse sends open then an explicit truncate for this. Open triggers the self-heal but by the time it tries to spawn the loops the file size is truncated to 0, so no loops are formed. These are the steps: 1) Take big lock by self-heal frame. Get the xattrs/stat to decide source, sink information. 2) loop frames are not spawned. The big lock is not released. 3) One more big lock is taken by the same self-heal frame, Erasing of the pending xattrs etc happen, now it does two big unlocks, but after the first unlock, the information on which the locks were performed is forgotten, so the next unlock becomes a no-op. So there is a stale big lock on that file preventing further writes. As a fix, if the loops are not spawned, use the previous big lock to perform the rest of the operations needed in completing the data self-heal. No need to have one more big lock. Change-Id: Id03171269594e447b2b6d1331e362d83bd1e3430 BUG: 3506 Reviewed-on: http://review.gluster.com/339 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Anand Avati <avati@gluster.com>
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-algorithm.c15
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c22
2 files changed, 30 insertions, 7 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
index 1c7cdf418..63889c845 100644
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
@@ -92,7 +92,7 @@ sh_number_of_writes_needed (unsigned char *write_needed, int child_count)
static int
-sh_loop_driver_done (call_frame_t *frame, xlator_t *this,
+sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this,
call_frame_t *last_loop_frame)
{
afr_private_t *priv = NULL;
@@ -103,13 +103,13 @@ sh_loop_driver_done (call_frame_t *frame, xlator_t *this,
int32_t diff_blocks = 0;
priv = this->private;
- local = frame->local;
+ local = sh_frame->local;
sh = &local->self_heal;
sh_priv = sh->private;
total_blocks = sh_priv->total_blocks;
diff_blocks = sh_priv->diff_blocks;
- sh_private_cleanup (frame, this);
+ sh_private_cleanup (sh_frame, this);
if (sh->op_failed) {
GF_ASSERT (!last_loop_frame);
//loop_finish should have happened and the old_loop should be NULL
@@ -117,7 +117,7 @@ sh_loop_driver_done (call_frame_t *frame, xlator_t *this,
"self-heal aborting on %s",
local->loc.path);
- local->self_heal.algo_abort_cbk (frame, this);
+ local->self_heal.algo_abort_cbk (sh_frame, this);
} else {
GF_ASSERT (last_loop_frame);
if (diff_blocks == total_blocks) {
@@ -131,8 +131,11 @@ sh_loop_driver_done (call_frame_t *frame, xlator_t *this,
((diff_blocks * 1.0)/total_blocks) * 100);
}
- sh->old_loop_frame = last_loop_frame;
- local->self_heal.algo_completion_cbk (frame, this);
+ if (sh_frame == last_loop_frame)
+ sh->old_loop_frame = NULL;
+ else
+ sh->old_loop_frame = last_loop_frame;
+ local->self_heal.algo_completion_cbk (sh_frame, this);
}
return 0;
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 60f42c54b..009ac705b 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -63,6 +63,15 @@ int
afr_sh_data_finish (call_frame_t *frame, xlator_t *this);
int
+afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this,
+ afr_fxattrop_cbk_t fxattrop_cbk);
+
+int
+afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *xattr);
+
+int
afr_sh_data_done (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
@@ -353,15 +362,26 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
int32_t op_errno, dict_t *xattr)
{
int call_count = 0;
+ afr_local_t *local = NULL;
+ afr_self_heal_t *sh = NULL;
call_count = afr_frame_return (frame);
if (call_count == 0) {
+ local = frame->local;
+ sh = &local->self_heal;
+ if (NULL == sh->old_loop_frame) {
+ GF_ASSERT (sh->data_lock_held);
+ afr_sh_data_fxattrop (frame, this,
+ afr_post_sh_data_fxattrop_cbk);
+ goto out;
+ }
+
afr_sh_data_lock (frame, this, 0, 0,
afr_post_sh_big_lock_success,
afr_post_sh_big_lock_failure);
}
-
+out:
return 0;
}