diff options
-rw-r--r-- | tests/bugs/distribute/bug-1543279.t | 65 | ||||
-rw-r--r-- | tests/include.rc | 3 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 175 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 10 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 1 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-lock.c | 29 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 63 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rename.c | 361 |
8 files changed, 625 insertions, 82 deletions
diff --git a/tests/bugs/distribute/bug-1543279.t b/tests/bugs/distribute/bug-1543279.t new file mode 100644 index 00000000000..67cc0f50c2f --- /dev/null +++ b/tests/bugs/distribute/bug-1543279.t @@ -0,0 +1,65 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../dht.rc + +TESTS_EXPECTED_IN_LOOP=44 +SCRIPT_TIMEOUT=600 + +rename_files() { + MOUNT=$1 + ITERATIONS=$2 + for i in $(seq 1 $ITERATIONS); do uuid="`uuidgen`"; echo "some data" > $MOUNT/test$uuid; mv $MOUNT/test$uuid $MOUNT/test -f || return $?; done +} + +run_test_for_volume() { + VOLUME=$1 + ITERATIONS=$2 + TEST_IN_LOOP $CLI volume start $VOLUME + + TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M0 + TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M1 + TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M2 + TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M3 + + rename_files $M0 $ITERATIONS & + M0_RENAME_PID=$! + + rename_files $M1 $ITERATIONS & + M1_RENAME_PID=$! + + rename_files $M2 $ITERATIONS & + M2_RENAME_PID=$! + + rename_files $M3 $ITERATIONS & + M3_RENAME_PID=$! + + TEST_IN_LOOP wait $M0_RENAME_PID + TEST_IN_LOOP wait $M1_RENAME_PID + TEST_IN_LOOP wait $M2_RENAME_PID + TEST_IN_LOOP wait $M3_RENAME_PID + + TEST_IN_LOOP $CLI volume stop $VOLUME + TEST_IN_LOOP $CLI volume delete $VOLUME + umount $M0 $M1 $M2 $M3 +} + +cleanup + +TEST glusterd +TEST pidof glusterd + +TEST $CLI volume create $V0 $H0:$B0/${V0}{0..8} force +run_test_for_volume $V0 200 + +TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0..8} force +run_test_for_volume $V0 200 + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..8} force +run_test_for_volume $V0 200 + +TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5} force +run_test_for_volume $V0 200 + +cleanup diff --git a/tests/include.rc b/tests/include.rc index f02bba1442b..5a99e027df8 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -1,6 +1,7 @@ M0=${M0:=/mnt/glusterfs/0}; # 0th mount point for FUSE M1=${M1:=/mnt/glusterfs/1}; # 1st mount point for FUSE M2=${M2:=/mnt/glusterfs/2}; # 2nd mount point for FUSE +M3=${M3:=/mnt/glusterfs/3}; # 3rd mount point for FUSE N0=${N0:=/mnt/nfs/0}; # 0th mount point for NFS N1=${N1:=/mnt/nfs/1}; # 1st mount point for NFS V0=${V0:=patchy}; # volume name to use in tests @@ -8,7 +9,7 @@ V1=${V1:=patchy1}; # volume name to use in tests GMV0=${GMV0:=master}; # master volume name to use in geo-rep tests GSV0=${GSV0:=slave}; # slave volume name to use in geo-rep tests B0=${B0:=/d/backends}; # top level of brick directories -WORKDIRS="$B0 $M0 $M1 $M2 $N0 $N1" +WORKDIRS="$B0 $M0 $M1 $M2 $M3 $N0 $N1" ROOT_GFID="00000000-0000-0000-0000-000000000001" DOT_SHARD_GFID="be318638-e8a0-4c6d-977d-7a937aa84806" diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 3492e7299ed..7a3f7eda935 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1931,7 +1931,6 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, GF_VALIDATE_OR_GOTO ("dht", this, out); GF_VALIDATE_OR_GOTO ("dht", frame->local, out); GF_VALIDATE_OR_GOTO ("dht", this->private, out); - GF_VALIDATE_OR_GOTO ("dht", cookie, out); local = frame->local; cached_subvol = local->cached_subvol; @@ -1939,6 +1938,9 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, gf_uuid_unparse(local->loc.gfid, gfid); + if (local->locked) + dht_unlock_namespace (frame, &local->lock[0]); + ret = dht_layout_preset (this, local->cached_subvol, local->loc.inode); if (ret < 0) { gf_msg_debug (this->name, EINVAL, @@ -1962,6 +1964,7 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, postparent, 1); } + unwind: gf_msg_debug (this->name, 0, "creation of linkto on hashed subvol:%s, " @@ -2133,6 +2136,134 @@ err: return -1; } + +int32_t +dht_linkfile_create_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + dht_local_t *local = NULL; + int call_cnt = 0, ret = 0; + xlator_t *subvol = NULL; + uuid_t gfid = {0, }; + char gfid_str[GF_UUID_BUF_SIZE] = {0}; + + subvol = cookie; + local = frame->local; + + if (subvol == local->hashed_subvol) { + if ((op_ret == 0) || (op_errno != ENOENT)) + local->dont_create_linkto = _gf_true; + } else { + if (gf_uuid_is_null (local->gfid)) + gf_uuid_copy (gfid, local->loc.gfid); + else + gf_uuid_copy (gfid, local->gfid); + + if ((op_ret == 0) && gf_uuid_compare (gfid, buf->ia_gfid)) { + gf_uuid_unparse (gfid, gfid_str); + gf_msg_debug (this->name, 0, + "gfid (%s) different on cached subvol " + "(%s) and looked up inode (%s), not " + "creating linkto", + uuid_utoa (buf->ia_gfid), subvol->name, + gfid_str); + local->dont_create_linkto = _gf_true; + } else if (op_ret == -1) { + local->dont_create_linkto = _gf_true; + } + } + + call_cnt = dht_frame_return (frame); + if (is_last_call (call_cnt)) { + if (local->dont_create_linkto) + goto no_linkto; + else { + gf_msg_debug (this->name, 0, + "Creating linkto file on %s(hash) to " + "%s on %s (gfid = %s)", + local->hashed_subvol->name, + local->loc.path, + local->cached_subvol->name, gfid); + + ret = dht_linkfile_create + (frame, dht_lookup_linkfile_create_cbk, + this, local->cached_subvol, + local->hashed_subvol, &local->loc); + + if (ret < 0) + goto no_linkto; + } + } + + return 0; + +no_linkto: + gf_msg_debug (this->name, 0, + "skipped linkto creation (path:%s) (gfid:%s) " + "(hashed-subvol:%s) (cached-subvol:%s)", + local->loc.path, gfid_str, local->hashed_subvol->name, + local->cached_subvol->name); + + dht_lookup_linkfile_create_cbk (frame, NULL, this, 0, 0, + local->loc.inode, &local->stbuf, + &local->preparent, &local->postparent, + local->xattr); + return 0; +} + + +int32_t +dht_call_lookup_linkfile_create (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + int i = 0; + xlator_t *subvol = NULL; + + local = frame->local; + if (gf_uuid_is_null (local->gfid)) + gf_uuid_unparse (local->loc.gfid, gfid); + else + gf_uuid_unparse (local->gfid, gfid); + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "protecting namespace failed, skipping linkto " + "creation (path:%s)(gfid:%s)(hashed-subvol:%s)" + "(cached-subvol:%s)", local->loc.path, gfid, + local->hashed_subvol->name, local->cached_subvol->name); + goto err; + } + + local->locked = _gf_true; + + + local->call_cnt = 2; + + for (i = 0; i < 2; i++) { + subvol = (subvol == NULL) ? local->hashed_subvol + : local->cached_subvol; + + STACK_WIND_COOKIE (frame, dht_linkfile_create_lookup_cbk, + subvol, subvol, subvol->fops->lookup, + &local->loc, NULL); + } + + return 0; + +err: + dht_lookup_linkfile_create_cbk (frame, NULL, this, 0, 0, + local->loc.inode, + &local->stbuf, &local->preparent, + &local->postparent, local->xattr); + return 0; +} + /* Rebalance is performed from cached_node to hashed_node. Initial cached_node * contains a non-linkto file. After migration it is converted to linkto and * then unlinked. And at hashed_subvolume, first a linkto file is present, @@ -2176,12 +2307,12 @@ err: int dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) { - int ret = 0; - dht_local_t *local = NULL; - xlator_t *hashed_subvol = NULL; - xlator_t *cached_subvol = NULL; - dht_layout_t *layout = NULL; - char gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + dht_local_t *local = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_layout_t *layout = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; gf_boolean_t found_non_linkto_on_hashed = _gf_false; local = frame->local; @@ -2273,8 +2404,8 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) "unlink on hashed is not skipped %s", local->loc.path); - DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, - NULL, NULL); + DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, + NULL, NULL, NULL, NULL); } return 0; } @@ -2490,14 +2621,23 @@ preset_layout: return 0; } - gf_msg_debug (this->name, 0, - "Creating linkto file on %s(hash) to %s on %s (gfid = %s)", - hashed_subvol->name, local->loc.path, - cached_subvol->name, gfid); + if (frame->root->op != GF_FOP_RENAME) { + local->current = &local->lock[0]; + ret = dht_protect_namespace (frame, &local->loc, hashed_subvol, + &local->current->ns, + dht_call_lookup_linkfile_create); + } else { + gf_msg_debug (this->name, 0, + "Creating linkto file on %s(hash) to %s on %s " + "(gfid = %s)", + hashed_subvol->name, local->loc.path, + cached_subvol->name, gfid); - ret = dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, this, - cached_subvol, hashed_subvol, &local->loc); + ret = dht_linkfile_create (frame, + dht_lookup_linkfile_create_cbk, this, + cached_subvol, hashed_subvol, + &local->loc); + } return ret; @@ -2800,6 +2940,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, removed, which can take away the namespace, and subvol is anyways down. */ + local->cached_subvol = NULL; if (op_errno != ENOTCONN) goto err; else @@ -8243,7 +8384,7 @@ out: int dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child, - int32_t *op_errno) + int32_t *op_errno) { inode_table_t *table = NULL; int ret = -1; diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 96f7726bc04..999a7874a22 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -169,7 +169,8 @@ typedef enum { typedef enum { REACTION_INVALID, FAIL_ON_ANY_ERROR, - IGNORE_ENOENT_ESTALE + IGNORE_ENOENT_ESTALE, + IGNORE_ENOENT_ESTALE_EIO, } dht_reaction_type_t; struct dht_skip_linkto_unlink { @@ -361,6 +362,10 @@ struct dht_local { dht_dir_transaction_t lock[2], *current; + /* inodelks during filerename for backward compatibility */ + dht_lock_t **rename_inodelk_backward_compatible; + int rename_inodelk_bc_count; + short lock_type; call_stub_t *stub; @@ -379,6 +384,9 @@ struct dht_local { int32_t valid; gf_boolean_t heal_layout; int32_t mds_heal_fresh_lookup; + loc_t loc2_copy; + gf_boolean_t locked; + gf_boolean_t dont_create_linkto; }; typedef struct dht_local dht_local_t; diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 381856f2455..c301b1fd47e 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -735,6 +735,7 @@ dht_local_wipe (xlator_t *this, dht_local_t *local) loc_wipe (&local->loc); loc_wipe (&local->loc2); + loc_wipe (&local->loc2_copy); if (local->xattr) dict_unref (local->xattr); diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c index 3e82c98bafd..3f389eafa75 100644 --- a/xlators/cluster/dht/src/dht-lock.c +++ b/xlators/cluster/dht/src/dht-lock.c @@ -1015,10 +1015,11 @@ static int32_t dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - int lk_index = 0; - int i = 0; - dht_local_t *local = NULL; - char gfid[GF_UUID_BUF_SIZE] = {0,}; + int lk_index = 0; + int i = 0; + dht_local_t *local = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0,}; + dht_reaction_type_t reaction = 0; lk_index = (long) cookie; @@ -1029,8 +1030,9 @@ dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, switch (op_errno) { case ESTALE: case ENOENT: - if (local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure - != IGNORE_ENOENT_ESTALE) { + reaction = local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure; + if ((reaction != IGNORE_ENOENT_ESTALE) && + (reaction != IGNORE_ENOENT_ESTALE_EIO)) { gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid); local->lock[0].layout.my_layout.op_ret = -1; local->lock[0].layout.my_layout.op_errno = op_errno; @@ -1042,6 +1044,21 @@ dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto cleanup; } break; + case EIO: + reaction = local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure; + if (reaction != IGNORE_ENOENT_ESTALE_EIO) { + gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid); + local->lock[0].layout.my_layout.op_ret = -1; + local->lock[0].layout.my_layout.op_errno = op_errno; + gf_msg (this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_INODELK_FAILED, + "inodelk failed on subvol %s. gfid:%s", + local->lock[0].layout.my_layout.locks[lk_index]->xl->name, + gfid); + goto cleanup; + } + break; + default: gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid); local->lock[0].layout.my_layout.op_ret = -1; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 91fb94d529c..cc03eaa1195 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -1605,7 +1605,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, struct gf_flock flock = {0, }; struct gf_flock plock = {0, }; loc_t tmp_loc = {0, }; - gf_boolean_t locked = _gf_false; + loc_t parent_loc = {0, }; + gf_boolean_t inodelk_locked = _gf_false; + gf_boolean_t entrylk_locked = _gf_false; gf_boolean_t p_locked = _gf_false; int lk_ret = -1; gf_defrag_info_t *defrag = NULL; @@ -1619,6 +1621,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, gf_boolean_t target_changed = _gf_false; xlator_t *new_target = NULL; xlator_t *old_target = NULL; + xlator_t *hashed_subvol = NULL; fd_t *linkto_fd = NULL; @@ -1686,6 +1689,28 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, " for file: %s", loc->path); } + ret = dht_build_parent_loc (this, &parent_loc, loc, fop_errno); + if (ret < 0) { + ret = -1; + gf_msg (this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to build parent loc, which is needed to " + "acquire entrylk to synchronize with renames on this " + "path. Skipping migration", loc->path); + goto out; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (hashed_subvol == NULL) { + ret = -1; + gf_msg (this->name, GF_LOG_WARNING, EINVAL, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: cannot find hashed subvol which is needed to " + "synchronize with renames on this path. " + "Skipping migration", loc->path); + goto out; + } + flock.l_type = F_WRLCK; tmp_loc.inode = inode_ref (loc->inode); @@ -1710,7 +1735,26 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - locked = _gf_true; + inodelk_locked = _gf_true; + + /* dht_rename has changed to use entrylk on hashed subvol for + * synchronization. So, rebalance too has to acquire an entrylk on + * hashed subvol. + */ + ret = syncop_entrylk (hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc, + loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL, + NULL); + if (ret < 0) { + *fop_errno = -ret; + ret = -1; + gf_msg (this->name, GF_LOG_WARNING, *fop_errno, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to acquire entrylk on subvol %s", + loc->path, hashed_subvol->name); + goto out; + } + + entrylk_locked = _gf_true; /* Phase 1 - Data migration is in progress from now on */ ret = syncop_lookup (from, loc, &stbuf, NULL, dict, &xattr_rsp); @@ -2348,7 +2392,7 @@ out: } } - if (locked) { + if (inodelk_locked) { flock.l_type = F_UNLCK; lk_ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN, @@ -2361,6 +2405,18 @@ out: } } + if (entrylk_locked) { + lk_ret = syncop_entrylk (hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, + &parent_loc, loc->name, ENTRYLK_UNLOCK, + ENTRYLK_UNLOCK, NULL, NULL); + if (lk_ret < 0) { + gf_msg (this->name, GF_LOG_WARNING, -lk_ret, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: failed to unlock entrylk on %s", + loc->path, hashed_subvol->name); + } + } + if (p_locked) { plock.l_type = F_UNLCK; lk_ret = syncop_lk (from, src_fd, F_SETLK, &plock, NULL, NULL); @@ -2400,6 +2456,7 @@ out: syncop_close (linkto_fd); loc_wipe (&tmp_loc); + loc_wipe (&parent_loc); return ret; } diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 3dc042e3ce5..d311ac633e3 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -18,6 +18,9 @@ #include "defaults.h" int dht_rename_unlock (call_frame_t *frame, xlator_t *this); +int32_t +dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata); int dht_rename_unlock_cbk (call_frame_t *frame, void *cookie, @@ -44,7 +47,7 @@ dht_rename_unlock_cbk (call_frame_t *frame, void *cookie, } static void -dht_rename_unlock_src (call_frame_t *frame, xlator_t *this) +dht_rename_dir_unlock_src (call_frame_t *frame, xlator_t *this) { dht_local_t *local = NULL; @@ -54,7 +57,7 @@ dht_rename_unlock_src (call_frame_t *frame, xlator_t *this) } static void -dht_rename_unlock_dst (call_frame_t *frame, xlator_t *this) +dht_rename_dir_unlock_dst (call_frame_t *frame, xlator_t *this) { dht_local_t *local = NULL; int op_ret = -1; @@ -107,8 +110,8 @@ static int dht_rename_dir_unlock (call_frame_t *frame, xlator_t *this) { - dht_rename_unlock_src (frame, this); - dht_rename_unlock_dst (frame, this); + dht_rename_dir_unlock_src (frame, this); + dht_rename_dir_unlock_dst (frame, this); return 0; } int @@ -721,12 +724,13 @@ dht_rename_unlock (call_frame_t *frame, xlator_t *this) int op_ret = -1; char src_gfid[GF_UUID_BUF_SIZE] = {0}; char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + dht_ilock_wrap_t inodelk_wrapper = {0, }; local = frame->local; - op_ret = dht_unlock_inodelk (frame, - local->lock[0].layout.parent_layout.locks, - local->lock[0].layout.parent_layout.lk_count, - dht_rename_unlock_cbk); + inodelk_wrapper.locks = local->rename_inodelk_backward_compatible; + inodelk_wrapper.lk_count = local->rename_inodelk_bc_count; + + op_ret = dht_unlock_inodelk_wrapper (frame, &inodelk_wrapper); if (op_ret < 0) { uuid_utoa_r (local->loc.inode->gfid, src_gfid); @@ -752,10 +756,13 @@ dht_rename_unlock (call_frame_t *frame, xlator_t *this) "stale locks left on bricks", local->loc.path, src_gfid, local->loc2.path, dst_gfid); - - dht_rename_unlock_cbk (frame, NULL, this, 0, 0, NULL); } + dht_unlock_namespace (frame, &local->lock[0]); + dht_unlock_namespace (frame, &local->lock[1]); + + dht_rename_unlock_cbk (frame, NULL, this, local->op_ret, + local->op_errno, NULL); return 0; } @@ -1470,6 +1477,8 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, char gfid_local[GF_UUID_BUF_SIZE] = {0}; char gfid_server[GF_UUID_BUF_SIZE] = {0}; int child_index = -1; + gf_boolean_t is_src = _gf_false; + loc_t *loc = NULL; child_index = (long)cookie; @@ -1477,22 +1486,98 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; conf = this->private; + is_src = (child_index == 0); + if (is_src) + loc = &local->loc; + else + loc = &local->loc2; + + if (op_ret >= 0) { + if (is_src) + local->src_cached + = dht_subvol_get_cached (this, + local->loc.inode); + else { + if (loc->inode) + gf_uuid_unparse (loc->inode->gfid, gfid_local); + + gf_msg_debug (this->name, 0, + "dst_cached before lookup: %s, " + "(path:%s)(gfid:%s),", + local->loc2.path, + local->dst_cached + ? local->dst_cached->name : + NULL, + local->dst_cached ? gfid_local : NULL); + + local->dst_cached + = dht_subvol_get_cached (this, + local->loc2_copy.inode); + + gf_uuid_unparse (stbuf->ia_gfid, gfid_local); + + gf_msg_debug (this->name, GF_LOG_WARNING, + "dst_cached after lookup: %s, " + "(path:%s)(gfid:%s)", + local->loc2.path, + local->dst_cached + ? local->dst_cached->name : + NULL, + local->dst_cached ? gfid_local : NULL); + + + if ((local->loc2.inode == NULL) + || gf_uuid_compare (stbuf->ia_gfid, + local->loc2.inode->gfid)) { + if (local->loc2.inode != NULL) { + inode_unlink (local->loc2.inode, + local->loc2.parent, + local->loc2.name); + inode_unref (local->loc2.inode); + } + + local->loc2.inode + = inode_link (local->loc2_copy.inode, + local->loc2_copy.parent, + local->loc2_copy.name, + stbuf); + gf_uuid_copy (local->loc2.gfid, + stbuf->ia_gfid); + } + } + } + if (op_ret < 0) { - /* The meaning of is_linkfile is overloaded here. For locking - * to work properly both rebalance and rename should acquire - * lock on datafile. The reason for sending this lookup is to - * find out whether we've acquired a lock on data file. - * Between the lookup before rename and this rename, the - * file could be migrated by a rebalance process and now this - * file this might be a linkto file. We verify that by sending - * this lookup. However, if this lookup fails we cannot really - * say whether we've acquired lock on a datafile or linkto file. - * So, we act conservatively and _assume_ - * that this is a linkfile and fail the rename operation. - */ - local->is_linkfile = _gf_true; - local->op_errno = op_errno; - } else if (xattr && check_is_linkfile (inode, stbuf, xattr, + if (is_src) { + /* The meaning of is_linkfile is overloaded here. For locking + * to work properly both rebalance and rename should acquire + * lock on datafile. The reason for sending this lookup is to + * find out whether we've acquired a lock on data file. + * Between the lookup before rename and this rename, the + * file could be migrated by a rebalance process and now this + * file this might be a linkto file. We verify that by sending + * this lookup. However, if this lookup fails we cannot really + * say whether we've acquired lock on a datafile or linkto file. + * So, we act conservatively and _assume_ + * that this is a linkfile and fail the rename operation. + */ + local->is_linkfile = _gf_true; + local->op_errno = op_errno; + } else { + if (local->dst_cached) + gf_msg_debug (this->name, op_errno, + "file %s (gfid:%s) was present " + "(hashed-subvol=%s, " + "cached-subvol=%s) before rename," + " but lookup failed", + local->loc2.path, + uuid_utoa (local->loc2.inode->gfid), + local->dst_hashed->name, + local->dst_cached->name); + if (dht_inode_missing (op_errno)) + local->dst_cached = NULL; + } + } else if (is_src && xattr && check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { local->is_linkfile = _gf_true; /* Found linkto file instead of data file, passdown ENOENT @@ -1500,11 +1585,9 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = ENOENT; } - if (!local->is_linkfile && - gf_uuid_compare (local->lock[0].layout.parent_layout.locks[child_index]->loc.gfid, - stbuf->ia_gfid)) { - gf_uuid_unparse (local->lock[0].layout.parent_layout.locks[child_index]->loc.gfid, - gfid_local); + if (!local->is_linkfile && (op_ret >= 0) && + gf_uuid_compare (loc->gfid, stbuf->ia_gfid)) { + gf_uuid_unparse (loc->gfid, gfid_local); gf_uuid_unparse (stbuf->ia_gfid, gfid_server); gf_msg (this->name, GF_LOG_WARNING, 0, @@ -1537,6 +1620,123 @@ fail: return 0; } +int +dht_rename_file_lock1_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret < 0) { + uuid_utoa_r (local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r (local->loc2.inode->gfid, dst_gfid); + + gf_msg (this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_INODE_LK_ERROR, + "protecting namespace of %s failed" + "rename (%s:%s:%s %s:%s:%s)", + local->current == &local->lock[0] ? local->loc.path + : local->loc2.path, + local->loc.path, src_gfid, local->src_hashed->name, + local->loc2.path, dst_gfid, + local->dst_hashed ? local->dst_hashed->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + goto err; + } + + if (local->current == &local->lock[0]) { + loc = &local->loc2; + subvol = local->dst_hashed; + local->current = &local->lock[1]; + } else { + loc = &local->loc; + subvol = local->src_hashed; + local->current = &local->lock[0]; + } + + ret = dht_protect_namespace (frame, loc, subvol, &local->current->ns, + dht_rename_lock_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; +err: + /* No harm in calling an extra unlock */ + dht_rename_unlock (frame, this); + return 0; +} + +int32_t +dht_rename_file_protect_namespace (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + char src_gfid[GF_UUID_BUF_SIZE] = {0}; + char dst_gfid[GF_UUID_BUF_SIZE] = {0}; + int ret = 0; + loc_t *loc = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + if (op_ret < 0) { + uuid_utoa_r (local->loc.inode->gfid, src_gfid); + + if (local->loc2.inode) + uuid_utoa_r (local->loc2.inode->gfid, dst_gfid); + + gf_msg (this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_INODE_LK_ERROR, + "acquiring inodelk failed " + "rename (%s:%s:%s %s:%s:%s)", + local->loc.path, src_gfid, local->src_cached->name, + local->loc2.path, dst_gfid, + local->dst_cached ? local->dst_cached->name : NULL); + + local->op_ret = -1; + local->op_errno = op_errno; + + goto err; + } + + /* Locks on src and dst needs to ordered which otherwise might cause + * deadlocks when rename (src, dst) and rename (dst, src) is done from + * two different clients + */ + dht_order_rename_lock (frame, &loc, &subvol); + + ret = dht_protect_namespace (frame, loc, subvol, + &local->current->ns, + dht_rename_file_lock1_cbk); + if (ret < 0) { + op_errno = EINVAL; + goto err; + } + + return 0; + +err: + /* Its fine to call unlock even when no locks are acquired, as we check + * for lock->locked before winding a unlock call. + */ + dht_rename_unlock (frame, this); + + return 0; +} + int32_t dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) @@ -1547,8 +1747,8 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dict_t *xattr_req = NULL; dht_conf_t *conf = NULL; int i = 0; - int count = 0; - + xlator_t *subvol = NULL; + dht_lock_t *lock = NULL; local = frame->local; conf = this->private; @@ -1561,11 +1761,13 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_msg (this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR, - "acquiring inodelk failed " + "protecting namespace of %s failed. " "rename (%s:%s:%s %s:%s:%s)", - local->loc.path, src_gfid, local->src_cached->name, + local->current == &local->lock[0] ? local->loc.path + : local->loc2.path, + local->loc.path, src_gfid, local->src_hashed->name, local->loc2.path, dst_gfid, - local->dst_cached ? local->dst_cached->name : NULL); + local->dst_hashed ? local->dst_hashed->name : NULL); local->op_ret = -1; local->op_errno = op_errno; @@ -1588,7 +1790,19 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto done; } - count = local->call_cnt = local->lock[0].layout.parent_layout.lk_count; + /* dst_cached might've changed. This normally happens for two reasons: + * 1. rebalance migrated dst + * 2. Another parallel rename was done overwriting dst + * + * Doing a lookup on local->loc2 when dst exists, but is associated + * with a different gfid will result in an ESTALE error. So, do a fresh + * lookup with a new inode on dst-path and handle change of dst-cached + * in the cbk. Also, to identify dst-cached changes we do a lookup on + * "this" rather than the subvol. + */ + loc_copy (&local->loc2_copy, &local->loc2); + inode_unref (local->loc2_copy.inode); + local->loc2_copy.inode = inode_new (local->loc.inode->table); /* Why not use local->lock.locks[?].loc for lookup post lock phase * --------------------------------------------------------------- @@ -1608,13 +1822,26 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * exists with the name that the client requested with. * */ - for (i = 0; i < count; i++) { - STACK_WIND_COOKIE (frame, dht_rename_lookup_cbk, (void *)(long)i, - local->lock[0].layout.parent_layout.locks[i]->xl, - local->lock[0].layout.parent_layout.locks[i]->xl->fops->lookup, - ((gf_uuid_compare (local->loc.gfid, \ - local->lock[0].layout.parent_layout.locks[i]->loc.gfid) == 0) ? - &local->loc : &local->loc2), xattr_req); + local->call_cnt = 2; + for (i = 0; i < 2; i++) { + if (i == 0) { + lock = local->rename_inodelk_backward_compatible[0]; + if (gf_uuid_compare (local->loc.gfid, + lock->loc.gfid) == 0) + subvol = lock->xl; + else { + lock = local->rename_inodelk_backward_compatible[1]; + subvol = lock->xl; + } + } else { + subvol = this; + } + + STACK_WIND_COOKIE (frame, dht_rename_lookup_cbk, + (void *)(long)i, subvol, + subvol->fops->lookup, + (i == 0) ? &local->loc : &local->loc2_copy, + xattr_req); } dict_unref (xattr_req); @@ -1644,7 +1871,8 @@ dht_rename_lock (call_frame_t *frame) if (local->dst_cached) count++; - lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_pointer); + lk_array = GF_CALLOC (count, sizeof (*lk_array), + gf_common_mt_pointer); if (lk_array == NULL) goto err; @@ -1655,22 +1883,40 @@ dht_rename_lock (call_frame_t *frame) goto err; if (local->dst_cached) { + /* dst might be removed by the time inodelk reaches bricks, + * which can result in ESTALE errors. POSIX imposes no + * restriction for dst to be present for renames to be + * successful. So, we'll ignore ESTALE errors. As far as + * synchronization on dst goes, we'll achieve the same by + * holding entrylk on parent directory of dst in the namespace + * of basename(dst). Also, there might not be quorum in cluster + * xlators like EC/disperse on errno, in which case they return + * EIO. For eg., in a disperse (4 + 2), 3 might return success + * and three might return ESTALE. Disperse, having no Quorum + * unwinds inodelk with EIO. So, ignore EIO too. + */ lk_array[1] = dht_lock_new (frame->this, local->dst_cached, &local->loc2, F_WRLCK, DHT_FILE_MIGRATE_DOMAIN, NULL, - FAIL_ON_ANY_ERROR); + IGNORE_ENOENT_ESTALE_EIO); if (lk_array[1] == NULL) goto err; } - local->lock[0].layout.parent_layout.locks = lk_array; - local->lock[0].layout.parent_layout.lk_count = count; + local->rename_inodelk_backward_compatible = lk_array; + local->rename_inodelk_bc_count = count; + /* retaining inodelks for the sake of backward compatibility. Please + * make sure to remove this inodelk once all of 3.10, 3.12 and 3.13 + * reach EOL. Better way of getting synchronization would be to acquire + * entrylks on src and dst parent directories in the namespace of + * basenames of src and dst + */ ret = dht_blocking_inodelk (frame, lk_array, count, - dht_rename_lock_cbk); + dht_rename_file_protect_namespace); if (ret < 0) { - local->lock[0].layout.parent_layout.locks = NULL; - local->lock[0].layout.parent_layout.lk_count = 0; + local->rename_inodelk_backward_compatible = NULL; + local->rename_inodelk_bc_count = 0; goto err; } @@ -1701,6 +1947,7 @@ dht_rename (call_frame_t *frame, xlator_t *this, dht_local_t *local = NULL; dht_conf_t *conf = NULL; char gfid[GF_UUID_BUF_SIZE] = {0}; + char newgfid[GF_UUID_BUF_SIZE] = {0}; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -1772,11 +2019,15 @@ dht_rename (call_frame_t *frame, xlator_t *this, if (xdata) local->xattr_req = dict_ref (xdata); + if (newloc->inode) + gf_uuid_unparse(newloc->inode->gfid, newgfid); + gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_INFO, - "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", - oldloc->path, src_hashed->name, src_cached->name, - newloc->path, dst_hashed->name, + "renaming %s (%s) (hash=%s/cache=%s) => %s (%s) " + "(hash=%s/cache=%s) ", + oldloc->path, gfid, src_hashed->name, src_cached->name, + newloc->path, newloc->inode ? newgfid : NULL, dst_hashed->name, dst_cached ? dst_cached->name : "<nul>"); if (IA_ISDIR (oldloc->inode->ia_type)) { @@ -1784,8 +2035,10 @@ dht_rename (call_frame_t *frame, xlator_t *this, } else { local->op_ret = 0; ret = dht_rename_lock (frame); - if (ret < 0) + if (ret < 0) { + op_errno = ENOMEM; goto err; + } } return 0; |