diff options
| -rw-r--r-- | tests/bugs/distribute/bug-1543279.t | 65 | ||||
| -rw-r--r-- | tests/include.rc | 3 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 175 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 10 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 1 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-lock.c | 29 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 63 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rename.c | 361 | 
8 files changed, 625 insertions, 82 deletions
diff --git a/tests/bugs/distribute/bug-1543279.t b/tests/bugs/distribute/bug-1543279.t new file mode 100644 index 00000000000..67cc0f50c2f --- /dev/null +++ b/tests/bugs/distribute/bug-1543279.t @@ -0,0 +1,65 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../dht.rc + +TESTS_EXPECTED_IN_LOOP=44 +SCRIPT_TIMEOUT=600 + +rename_files() { +    MOUNT=$1 +    ITERATIONS=$2 +    for i in $(seq 1 $ITERATIONS); do uuid="`uuidgen`"; echo "some data" > $MOUNT/test$uuid; mv $MOUNT/test$uuid $MOUNT/test -f || return $?; done +} + +run_test_for_volume() { +    VOLUME=$1 +    ITERATIONS=$2 +    TEST_IN_LOOP $CLI volume start $VOLUME + +    TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M0 +    TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M1 +    TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M2 +    TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M3 + +    rename_files $M0 $ITERATIONS & +    M0_RENAME_PID=$! + +    rename_files $M1 $ITERATIONS & +    M1_RENAME_PID=$! + +    rename_files $M2 $ITERATIONS & +    M2_RENAME_PID=$! + +    rename_files $M3 $ITERATIONS & +    M3_RENAME_PID=$! + +    TEST_IN_LOOP wait $M0_RENAME_PID +    TEST_IN_LOOP wait $M1_RENAME_PID +    TEST_IN_LOOP wait $M2_RENAME_PID +    TEST_IN_LOOP wait $M3_RENAME_PID + +    TEST_IN_LOOP $CLI volume stop $VOLUME +    TEST_IN_LOOP $CLI volume delete $VOLUME +    umount $M0 $M1 $M2 $M3 +} + +cleanup + +TEST glusterd +TEST pidof glusterd + +TEST $CLI volume create $V0 $H0:$B0/${V0}{0..8} force +run_test_for_volume $V0 200 + +TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0..8} force +run_test_for_volume $V0 200 + +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..8} force +run_test_for_volume $V0 200 + +TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5} force +run_test_for_volume $V0 200 + +cleanup diff --git a/tests/include.rc b/tests/include.rc index f02bba1442b..5a99e027df8 100644 --- a/tests/include.rc +++ b/tests/include.rc @@ -1,6 +1,7 @@  M0=${M0:=/mnt/glusterfs/0};   # 0th mount point for FUSE  M1=${M1:=/mnt/glusterfs/1};   # 1st mount point for FUSE  M2=${M2:=/mnt/glusterfs/2};   # 2nd mount point for FUSE +M3=${M3:=/mnt/glusterfs/3};   # 3rd mount point for FUSE  N0=${N0:=/mnt/nfs/0};         # 0th mount point for NFS  N1=${N1:=/mnt/nfs/1};         # 1st mount point for NFS  V0=${V0:=patchy};             # volume name to use in tests @@ -8,7 +9,7 @@ V1=${V1:=patchy1};            # volume name to use in tests  GMV0=${GMV0:=master};	      # master volume name to use in geo-rep tests  GSV0=${GSV0:=slave};	      # slave volume name to use in geo-rep tests  B0=${B0:=/d/backends};        # top level of brick directories -WORKDIRS="$B0 $M0 $M1 $M2 $N0 $N1" +WORKDIRS="$B0 $M0 $M1 $M2 $M3 $N0 $N1"  ROOT_GFID="00000000-0000-0000-0000-000000000001"  DOT_SHARD_GFID="be318638-e8a0-4c6d-977d-7a937aa84806" diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 3492e7299ed..7a3f7eda935 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1931,7 +1931,6 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,          GF_VALIDATE_OR_GOTO ("dht", this, out);          GF_VALIDATE_OR_GOTO ("dht", frame->local, out);          GF_VALIDATE_OR_GOTO ("dht", this->private, out); -        GF_VALIDATE_OR_GOTO ("dht", cookie, out);          local = frame->local;          cached_subvol = local->cached_subvol; @@ -1939,6 +1938,9 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,          gf_uuid_unparse(local->loc.gfid, gfid); +        if (local->locked) +                dht_unlock_namespace (frame, &local->lock[0]); +          ret = dht_layout_preset (this, local->cached_subvol, local->loc.inode);          if (ret < 0) {                  gf_msg_debug (this->name, EINVAL, @@ -1962,6 +1964,7 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,                                             postparent, 1);          } +  unwind:          gf_msg_debug (this->name, 0,                        "creation of linkto on hashed subvol:%s, " @@ -2133,6 +2136,134 @@ err:          return -1;  } + +int32_t +dht_linkfile_create_lookup_cbk (call_frame_t *frame, void *cookie, +                                xlator_t *this, int32_t op_ret, +                                int32_t op_errno, inode_t *inode, +                                struct iatt *buf, dict_t *xdata, +                                struct iatt *postparent) +{ +        dht_local_t *local                      = NULL; +        int          call_cnt                   = 0, ret = 0; +        xlator_t    *subvol                     = NULL; +        uuid_t       gfid                       = {0, }; +        char         gfid_str[GF_UUID_BUF_SIZE] = {0}; + +        subvol = cookie; +        local = frame->local; + +        if (subvol == local->hashed_subvol) { +                if ((op_ret == 0) || (op_errno != ENOENT)) +                        local->dont_create_linkto = _gf_true; +        } else { +                if (gf_uuid_is_null (local->gfid)) +                        gf_uuid_copy (gfid, local->loc.gfid); +                else +                        gf_uuid_copy (gfid, local->gfid); + +                if ((op_ret == 0) && gf_uuid_compare (gfid, buf->ia_gfid)) { +                        gf_uuid_unparse (gfid, gfid_str); +                        gf_msg_debug (this->name, 0, +                                      "gfid (%s) different on cached subvol " +                                      "(%s) and looked up inode (%s), not " +                                      "creating linkto", +                                      uuid_utoa (buf->ia_gfid), subvol->name, +                                      gfid_str); +                        local->dont_create_linkto = _gf_true; +                } else if (op_ret == -1) { +                        local->dont_create_linkto = _gf_true; +                } +        } + +        call_cnt = dht_frame_return (frame); +        if (is_last_call (call_cnt)) { +                if (local->dont_create_linkto) +                        goto no_linkto; +                else { +                        gf_msg_debug (this->name, 0, +                                      "Creating linkto file on %s(hash) to " +                                      "%s on %s (gfid = %s)", +                                      local->hashed_subvol->name, +                                      local->loc.path, +                                      local->cached_subvol->name, gfid); + +                        ret = dht_linkfile_create +                                (frame, dht_lookup_linkfile_create_cbk, +                                 this, local->cached_subvol, +                                 local->hashed_subvol, &local->loc); + +                        if (ret < 0) +                                goto no_linkto; +                } +        } + +        return 0; + +no_linkto: +        gf_msg_debug (this->name, 0, +                      "skipped linkto creation (path:%s) (gfid:%s) " +                      "(hashed-subvol:%s) (cached-subvol:%s)", +                      local->loc.path, gfid_str, local->hashed_subvol->name, +                      local->cached_subvol->name); + +        dht_lookup_linkfile_create_cbk (frame, NULL, this, 0, 0, +                                        local->loc.inode, &local->stbuf, +                                        &local->preparent, &local->postparent, +                                        local->xattr); +        return 0; +} + + +int32_t +dht_call_lookup_linkfile_create (call_frame_t *frame, void *cookie, +                                 xlator_t *this, int32_t op_ret, +                                 int32_t op_errno, dict_t *xdata) +{ +        dht_local_t *local          = NULL; +        char gfid[GF_UUID_BUF_SIZE] = {0}; +        int          i              = 0; +        xlator_t    *subvol         = NULL; + +        local = frame->local; +        if (gf_uuid_is_null (local->gfid)) +                gf_uuid_unparse (local->loc.gfid, gfid); +        else +                gf_uuid_unparse (local->gfid, gfid); + +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "protecting namespace failed, skipping linkto " +                        "creation (path:%s)(gfid:%s)(hashed-subvol:%s)" +                        "(cached-subvol:%s)", local->loc.path, gfid, +                        local->hashed_subvol->name, local->cached_subvol->name); +                goto err; +        } + +        local->locked = _gf_true; + + +        local->call_cnt = 2; + +        for (i = 0; i < 2; i++) { +                subvol = (subvol == NULL) ? local->hashed_subvol +                        : local->cached_subvol; + +                STACK_WIND_COOKIE (frame, dht_linkfile_create_lookup_cbk, +                                   subvol, subvol, subvol->fops->lookup, +                                   &local->loc, NULL); +        } + +        return 0; + +err: +        dht_lookup_linkfile_create_cbk (frame, NULL, this, 0, 0, +                                        local->loc.inode, +                                        &local->stbuf, &local->preparent, +                                        &local->postparent, local->xattr); +        return 0; +} +  /* Rebalance is performed from cached_node to hashed_node. Initial cached_node   * contains a non-linkto file. After migration it is converted to linkto and   * then unlinked. And at hashed_subvolume, first a linkto file is present, @@ -2176,12 +2307,12 @@ err:  int  dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)  { -        int           ret = 0; -        dht_local_t  *local = NULL; -        xlator_t     *hashed_subvol = NULL; -        xlator_t     *cached_subvol = NULL; -        dht_layout_t *layout = NULL; -        char gfid[GF_UUID_BUF_SIZE] = {0}; +        int           ret                        = 0; +        dht_local_t  *local                      = NULL; +        xlator_t     *hashed_subvol              = NULL; +        xlator_t     *cached_subvol              = NULL; +        dht_layout_t *layout                     = NULL; +        char gfid[GF_UUID_BUF_SIZE]              = {0};          gf_boolean_t  found_non_linkto_on_hashed = _gf_false;          local = frame->local; @@ -2273,8 +2404,8 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)                                        "unlink on hashed is not skipped %s",                                        local->loc.path); -                        DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, -                                          NULL, NULL); +                        DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, +                                          NULL, NULL, NULL, NULL);                  }                  return 0;          } @@ -2490,14 +2621,23 @@ preset_layout:                  return 0;          } -        gf_msg_debug (this->name, 0, -                      "Creating linkto file on %s(hash) to %s on %s (gfid = %s)", -                      hashed_subvol->name, local->loc.path, -                      cached_subvol->name, gfid); +        if (frame->root->op != GF_FOP_RENAME) { +                local->current = &local->lock[0]; +                ret = dht_protect_namespace (frame, &local->loc, hashed_subvol, +                                             &local->current->ns, +                                             dht_call_lookup_linkfile_create); +        } else { +                gf_msg_debug (this->name, 0, +                              "Creating linkto file on %s(hash) to %s on %s " +                              "(gfid = %s)", +                              hashed_subvol->name, local->loc.path, +                              cached_subvol->name, gfid); -        ret = dht_linkfile_create (frame, -                                   dht_lookup_linkfile_create_cbk, this, -                                   cached_subvol, hashed_subvol, &local->loc); +                ret = dht_linkfile_create (frame, +                                           dht_lookup_linkfile_create_cbk, this, +                                           cached_subvol, hashed_subvol, +                                           &local->loc); +        }          return ret; @@ -2800,6 +2940,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,                  removed, which can take away the namespace, and subvol is                  anyways down. */ +                local->cached_subvol = NULL;                  if (op_errno != ENOTCONN)                          goto err;                  else @@ -8243,7 +8384,7 @@ out:  int  dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child, -                                                 int32_t *op_errno) +                      int32_t *op_errno)  {          inode_table_t   *table = NULL;          int     ret = -1; diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 96f7726bc04..999a7874a22 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -169,7 +169,8 @@ typedef enum {  typedef enum {          REACTION_INVALID,          FAIL_ON_ANY_ERROR, -        IGNORE_ENOENT_ESTALE +        IGNORE_ENOENT_ESTALE, +        IGNORE_ENOENT_ESTALE_EIO,  } dht_reaction_type_t;  struct dht_skip_linkto_unlink { @@ -361,6 +362,10 @@ struct dht_local {          dht_dir_transaction_t lock[2], *current; +        /* inodelks during filerename for backward compatibility */ +        dht_lock_t           **rename_inodelk_backward_compatible; +        int                    rename_inodelk_bc_count; +          short           lock_type;          call_stub_t *stub; @@ -379,6 +384,9 @@ struct dht_local {          int32_t valid;          gf_boolean_t heal_layout;          int32_t mds_heal_fresh_lookup; +        loc_t        loc2_copy; +        gf_boolean_t locked; +        gf_boolean_t dont_create_linkto;  };  typedef struct dht_local dht_local_t; diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 381856f2455..c301b1fd47e 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -735,6 +735,7 @@ dht_local_wipe (xlator_t *this, dht_local_t *local)          loc_wipe (&local->loc);          loc_wipe (&local->loc2); +        loc_wipe (&local->loc2_copy);          if (local->xattr)                  dict_unref (local->xattr); diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c index 3e82c98bafd..3f389eafa75 100644 --- a/xlators/cluster/dht/src/dht-lock.c +++ b/xlators/cluster/dht/src/dht-lock.c @@ -1015,10 +1015,11 @@ static int32_t  dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                            int32_t op_ret, int32_t op_errno, dict_t *xdata)  { -        int          lk_index                   = 0; -        int          i                          = 0; -        dht_local_t *local                      = NULL; -        char         gfid[GF_UUID_BUF_SIZE]     = {0,}; +        int                  lk_index       = 0; +        int                  i              = 0; +        dht_local_t         *local          = NULL; +        char         gfid[GF_UUID_BUF_SIZE] = {0,}; +        dht_reaction_type_t  reaction       = 0;          lk_index = (long) cookie; @@ -1029,8 +1030,9 @@ dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  switch (op_errno) {                  case ESTALE:                  case ENOENT: -                        if (local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure -                            != IGNORE_ENOENT_ESTALE) { +                        reaction = local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure; +                        if ((reaction != IGNORE_ENOENT_ESTALE) && +                            (reaction != IGNORE_ENOENT_ESTALE_EIO)) {                                  gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid);                                  local->lock[0].layout.my_layout.op_ret = -1;                                  local->lock[0].layout.my_layout.op_errno = op_errno; @@ -1042,6 +1044,21 @@ dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                                  goto cleanup;                          }                          break; +                case EIO: +                        reaction = local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure; +                        if (reaction != IGNORE_ENOENT_ESTALE_EIO) { +                                gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid); +                                local->lock[0].layout.my_layout.op_ret = -1; +                                local->lock[0].layout.my_layout.op_errno = op_errno; +                                gf_msg (this->name, GF_LOG_ERROR, op_errno, +                                        DHT_MSG_INODELK_FAILED, +                                        "inodelk failed on subvol %s. gfid:%s", +                                        local->lock[0].layout.my_layout.locks[lk_index]->xl->name, +                                        gfid); +                                goto cleanup; +                        } +                        break; +                  default:                          gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid);                          local->lock[0].layout.my_layout.op_ret = -1; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 91fb94d529c..cc03eaa1195 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -1605,7 +1605,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,          struct gf_flock         flock                   = {0, };          struct gf_flock         plock                   = {0, };          loc_t                   tmp_loc                 = {0, }; -        gf_boolean_t            locked                  = _gf_false; +        loc_t                   parent_loc              = {0, }; +        gf_boolean_t            inodelk_locked          = _gf_false; +        gf_boolean_t            entrylk_locked          = _gf_false;          gf_boolean_t            p_locked                = _gf_false;          int                     lk_ret                  = -1;          gf_defrag_info_t        *defrag                 =  NULL; @@ -1619,6 +1621,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,          gf_boolean_t            target_changed          = _gf_false;          xlator_t                *new_target             = NULL;          xlator_t                *old_target             = NULL; +        xlator_t                *hashed_subvol          = NULL;          fd_t                    *linkto_fd              = NULL; @@ -1686,6 +1689,28 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,                          " for file: %s", loc->path);          } +        ret = dht_build_parent_loc (this, &parent_loc, loc, fop_errno); +        if (ret < 0) { +                ret = -1; +                gf_msg (this->name, GF_LOG_WARNING, *fop_errno, +                        DHT_MSG_MIGRATE_FILE_FAILED, +                        "%s: failed to build parent loc, which is needed to " +                        "acquire entrylk to synchronize with renames on this " +                        "path. Skipping migration", loc->path); +                goto out; +        } + +        hashed_subvol = dht_subvol_get_hashed (this, loc); +        if (hashed_subvol == NULL) { +                ret = -1; +                gf_msg (this->name, GF_LOG_WARNING, EINVAL, +                        DHT_MSG_MIGRATE_FILE_FAILED, +                        "%s: cannot find hashed subvol which is needed to " +                        "synchronize with renames on this path. " +                        "Skipping migration", loc->path); +                goto out; +        } +          flock.l_type = F_WRLCK;          tmp_loc.inode = inode_ref (loc->inode); @@ -1710,7 +1735,26 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,                  goto out;          } -        locked = _gf_true; +        inodelk_locked = _gf_true; + +        /* dht_rename has changed to use entrylk on hashed subvol for +         * synchronization. So, rebalance too has to acquire an entrylk on +         * hashed subvol. +         */ +        ret = syncop_entrylk (hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc, +                              loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL, +                              NULL); +        if (ret < 0) { +                *fop_errno = -ret; +                ret = -1; +                gf_msg (this->name, GF_LOG_WARNING, *fop_errno, +                        DHT_MSG_MIGRATE_FILE_FAILED, +                        "%s: failed to acquire entrylk on subvol %s", +                        loc->path, hashed_subvol->name); +                goto out; +        } + +        entrylk_locked = _gf_true;          /* Phase 1 - Data migration is in progress from now on */          ret = syncop_lookup (from, loc, &stbuf, NULL, dict, &xattr_rsp); @@ -2348,7 +2392,7 @@ out:                  }          } -        if (locked) { +        if (inodelk_locked) {                  flock.l_type = F_UNLCK;                  lk_ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN, @@ -2361,6 +2405,18 @@ out:                  }          } +        if (entrylk_locked) { +                lk_ret = syncop_entrylk (hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, +                                         &parent_loc, loc->name, ENTRYLK_UNLOCK, +                                         ENTRYLK_UNLOCK, NULL, NULL); +                if (lk_ret < 0) { +                        gf_msg (this->name, GF_LOG_WARNING, -lk_ret, +                                DHT_MSG_MIGRATE_FILE_FAILED, +                                "%s: failed to unlock entrylk on %s", +                                loc->path, hashed_subvol->name); +                } +        } +          if (p_locked) {                  plock.l_type = F_UNLCK;                  lk_ret = syncop_lk (from, src_fd, F_SETLK, &plock, NULL, NULL); @@ -2400,6 +2456,7 @@ out:                  syncop_close (linkto_fd);          loc_wipe (&tmp_loc); +        loc_wipe (&parent_loc);          return ret;  } diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 3dc042e3ce5..d311ac633e3 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -18,6 +18,9 @@  #include "defaults.h"  int dht_rename_unlock (call_frame_t *frame, xlator_t *this); +int32_t +dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                     int32_t op_ret, int32_t op_errno, dict_t *xdata);  int  dht_rename_unlock_cbk (call_frame_t *frame, void *cookie, @@ -44,7 +47,7 @@ dht_rename_unlock_cbk (call_frame_t *frame, void *cookie,  }  static void -dht_rename_unlock_src (call_frame_t *frame, xlator_t *this) +dht_rename_dir_unlock_src (call_frame_t *frame, xlator_t *this)  {          dht_local_t *local                      = NULL; @@ -54,7 +57,7 @@ dht_rename_unlock_src (call_frame_t *frame, xlator_t *this)  }  static void -dht_rename_unlock_dst (call_frame_t *frame, xlator_t *this) +dht_rename_dir_unlock_dst (call_frame_t *frame, xlator_t *this)  {          dht_local_t *local                      = NULL;          int          op_ret                     = -1; @@ -107,8 +110,8 @@ static int  dht_rename_dir_unlock (call_frame_t *frame, xlator_t *this)  { -        dht_rename_unlock_src (frame, this); -        dht_rename_unlock_dst (frame, this); +        dht_rename_dir_unlock_src (frame, this); +        dht_rename_dir_unlock_dst (frame, this);          return 0;  }  int @@ -721,12 +724,13 @@ dht_rename_unlock (call_frame_t *frame, xlator_t *this)          int          op_ret                     = -1;          char         src_gfid[GF_UUID_BUF_SIZE] = {0};          char         dst_gfid[GF_UUID_BUF_SIZE] = {0}; +        dht_ilock_wrap_t inodelk_wrapper        = {0, };          local = frame->local; -        op_ret = dht_unlock_inodelk (frame, -                                     local->lock[0].layout.parent_layout.locks, -                                     local->lock[0].layout.parent_layout.lk_count, -                                     dht_rename_unlock_cbk); +        inodelk_wrapper.locks = local->rename_inodelk_backward_compatible; +        inodelk_wrapper.lk_count = local->rename_inodelk_bc_count; + +        op_ret = dht_unlock_inodelk_wrapper (frame, &inodelk_wrapper);          if (op_ret < 0) {                  uuid_utoa_r (local->loc.inode->gfid, src_gfid); @@ -752,10 +756,13 @@ dht_rename_unlock (call_frame_t *frame, xlator_t *this)                                  "stale locks left on bricks",                                  local->loc.path, src_gfid,                                  local->loc2.path, dst_gfid); - -                dht_rename_unlock_cbk (frame, NULL, this, 0, 0, NULL);          } +        dht_unlock_namespace (frame, &local->lock[0]); +        dht_unlock_namespace (frame, &local->lock[1]); + +        dht_rename_unlock_cbk (frame, NULL, this, local->op_ret, +                               local->op_errno, NULL);          return 0;  } @@ -1470,6 +1477,8 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          char         gfid_local[GF_UUID_BUF_SIZE]       = {0};          char         gfid_server[GF_UUID_BUF_SIZE]      = {0};          int          child_index                        = -1; +        gf_boolean_t is_src                             = _gf_false; +        loc_t       *loc                                = NULL;          child_index = (long)cookie; @@ -1477,22 +1486,98 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          local = frame->local;          conf = this->private; +        is_src = (child_index == 0); +        if (is_src) +                loc = &local->loc; +        else +                loc = &local->loc2; + +        if (op_ret >= 0) { +                if (is_src) +                        local->src_cached +                                = dht_subvol_get_cached (this, +                                                         local->loc.inode); +                else { +                        if (loc->inode) +                                gf_uuid_unparse (loc->inode->gfid, gfid_local); + +                        gf_msg_debug (this->name, 0, +                                      "dst_cached before lookup: %s, " +                                      "(path:%s)(gfid:%s),", +                                      local->loc2.path, +                                      local->dst_cached +                                      ? local->dst_cached->name : +                                      NULL, +                                      local->dst_cached ? gfid_local : NULL); + +                        local->dst_cached +                                = dht_subvol_get_cached (this, +                                                         local->loc2_copy.inode); + +                        gf_uuid_unparse (stbuf->ia_gfid, gfid_local); + +                        gf_msg_debug (this->name, GF_LOG_WARNING, +                                      "dst_cached after lookup: %s, " +                                      "(path:%s)(gfid:%s)", +                                      local->loc2.path, +                                      local->dst_cached +                                      ? local->dst_cached->name : +                                      NULL, +                                      local->dst_cached ? gfid_local : NULL); + + +                        if ((local->loc2.inode == NULL) +                            || gf_uuid_compare (stbuf->ia_gfid, +                                                local->loc2.inode->gfid)) { +                                if (local->loc2.inode != NULL) { +                                        inode_unlink (local->loc2.inode, +                                                      local->loc2.parent, +                                                      local->loc2.name); +                                        inode_unref (local->loc2.inode); +                                } + +                                local->loc2.inode +                                        = inode_link (local->loc2_copy.inode, +                                                      local->loc2_copy.parent, +                                                      local->loc2_copy.name, +                                                      stbuf); +                                gf_uuid_copy (local->loc2.gfid, +                                              stbuf->ia_gfid); +                        } +                } +        } +          if (op_ret < 0) { -                /* The meaning of is_linkfile is overloaded here. For locking -                 * to work properly both rebalance and rename should acquire -                 * lock on datafile. The reason for sending this lookup is to -                 * find out whether we've acquired a lock on data file. -                 * Between the lookup before rename and this rename, the -                 * file could be migrated by a rebalance process and now this -                 * file this might be a linkto file. We verify that by sending -                 * this lookup. However, if this lookup fails we cannot really -                 * say whether we've acquired lock on a datafile or linkto file. -                 * So, we act conservatively and _assume_ -                 * that this is a linkfile and fail the rename operation. -                 */ -                local->is_linkfile = _gf_true; -                local->op_errno = op_errno; -        } else if (xattr && check_is_linkfile (inode, stbuf, xattr, +                if (is_src) { +                        /* The meaning of is_linkfile is overloaded here. For locking +                         * to work properly both rebalance and rename should acquire +                         * lock on datafile. The reason for sending this lookup is to +                         * find out whether we've acquired a lock on data file. +                         * Between the lookup before rename and this rename, the +                         * file could be migrated by a rebalance process and now this +                         * file this might be a linkto file. We verify that by sending +                         * this lookup. However, if this lookup fails we cannot really +                         * say whether we've acquired lock on a datafile or linkto file. +                         * So, we act conservatively and _assume_ +                         * that this is a linkfile and fail the rename operation. +                         */ +                        local->is_linkfile = _gf_true; +                        local->op_errno = op_errno; +                } else { +                        if (local->dst_cached) +                                gf_msg_debug (this->name, op_errno, +                                              "file %s (gfid:%s) was present " +                                              "(hashed-subvol=%s, " +                                              "cached-subvol=%s) before rename," +                                              " but lookup failed", +                                              local->loc2.path, +                                              uuid_utoa (local->loc2.inode->gfid), +                                              local->dst_hashed->name, +                                              local->dst_cached->name); +                        if (dht_inode_missing (op_errno)) +                                local->dst_cached = NULL; +                } +        } else if (is_src && xattr && check_is_linkfile (inode, stbuf, xattr,                                                 conf->link_xattr_name)) {                  local->is_linkfile = _gf_true;                  /* Found linkto file instead of data file, passdown ENOENT @@ -1500,11 +1585,9 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  local->op_errno = ENOENT;          } -        if (!local->is_linkfile && -             gf_uuid_compare (local->lock[0].layout.parent_layout.locks[child_index]->loc.gfid, -             stbuf->ia_gfid)) { -                gf_uuid_unparse (local->lock[0].layout.parent_layout.locks[child_index]->loc.gfid, -                                 gfid_local); +        if (!local->is_linkfile && (op_ret >= 0) && +            gf_uuid_compare (loc->gfid, stbuf->ia_gfid)) { +                gf_uuid_unparse (loc->gfid, gfid_local);                  gf_uuid_unparse (stbuf->ia_gfid, gfid_server);                  gf_msg (this->name, GF_LOG_WARNING, 0, @@ -1537,6 +1620,123 @@ fail:          return 0;  } +int +dht_rename_file_lock1_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                           int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ +        dht_local_t *local                      = NULL; +        char         src_gfid[GF_UUID_BUF_SIZE] = {0}; +        char         dst_gfid[GF_UUID_BUF_SIZE] = {0}; +        int          ret                        = 0; +        loc_t       *loc                        = NULL; +        xlator_t    *subvol                     = NULL; + +        local = frame->local; + +        if (op_ret < 0) { +                uuid_utoa_r (local->loc.inode->gfid, src_gfid); + +                if (local->loc2.inode) +                        uuid_utoa_r (local->loc2.inode->gfid, dst_gfid); + +                gf_msg (this->name, GF_LOG_WARNING, op_errno, +                        DHT_MSG_INODE_LK_ERROR, +                        "protecting namespace of %s failed" +                        "rename (%s:%s:%s %s:%s:%s)", +                        local->current == &local->lock[0] ? local->loc.path +                        : local->loc2.path, +                        local->loc.path, src_gfid, local->src_hashed->name, +                        local->loc2.path, dst_gfid, +                        local->dst_hashed ? local->dst_hashed->name : NULL); + +                local->op_ret = -1; +                local->op_errno = op_errno; +                goto err; +        } + +        if (local->current == &local->lock[0]) { +                loc = &local->loc2; +                subvol = local->dst_hashed; +                local->current = &local->lock[1]; +        } else { +                loc = &local->loc; +                subvol = local->src_hashed; +                local->current = &local->lock[0]; +        } + +        ret = dht_protect_namespace (frame, loc, subvol, &local->current->ns, +                                     dht_rename_lock_cbk); +        if (ret < 0) { +                op_errno = EINVAL; +                goto err; +        } + +        return 0; +err: +        /* No harm in calling an extra unlock */ +        dht_rename_unlock (frame, this); +        return 0; +} + +int32_t +dht_rename_file_protect_namespace (call_frame_t *frame, void *cookie, +                                   xlator_t *this, int32_t op_ret, +                                   int32_t op_errno, dict_t *xdata) +{ +        dht_local_t  *local                     = NULL; +        char         src_gfid[GF_UUID_BUF_SIZE] = {0}; +        char         dst_gfid[GF_UUID_BUF_SIZE] = {0}; +        int          ret                        = 0; +        loc_t       *loc                        = NULL; +        xlator_t    *subvol                     = NULL; + +        local = frame->local; + +        if (op_ret < 0) { +                uuid_utoa_r (local->loc.inode->gfid, src_gfid); + +                if (local->loc2.inode) +                        uuid_utoa_r (local->loc2.inode->gfid, dst_gfid); + +                gf_msg (this->name, GF_LOG_WARNING, op_errno, +                        DHT_MSG_INODE_LK_ERROR, +                        "acquiring inodelk failed " +                        "rename (%s:%s:%s %s:%s:%s)", +                        local->loc.path, src_gfid, local->src_cached->name, +                        local->loc2.path, dst_gfid, +                        local->dst_cached ? local->dst_cached->name : NULL); + +                local->op_ret = -1; +                local->op_errno = op_errno; + +                goto err; +        } + +        /* Locks on src and dst needs to ordered which otherwise might cause +         * deadlocks when rename (src, dst) and rename (dst, src) is done from +         * two different clients +         */ +        dht_order_rename_lock (frame, &loc, &subvol); + +        ret = dht_protect_namespace (frame, loc, subvol, +                                     &local->current->ns, +                                     dht_rename_file_lock1_cbk); +        if (ret < 0) { +                op_errno = EINVAL; +                goto err; +        } + +        return 0; + +err: +        /* Its fine to call unlock even when no locks are acquired, as we check +         * for lock->locked before winding a unlock call. +         */ +        dht_rename_unlock (frame, this); + +        return 0; +} +  int32_t  dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                       int32_t op_ret, int32_t op_errno, dict_t *xdata) @@ -1547,8 +1747,8 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          dict_t      *xattr_req                  = NULL;          dht_conf_t  *conf                       = NULL;          int          i                          = 0; -        int          count                      = 0; - +        xlator_t    *subvol                     = NULL; +        dht_lock_t  *lock                       = NULL;          local = frame->local;          conf = this->private; @@ -1561,11 +1761,13 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  gf_msg (this->name, GF_LOG_WARNING, op_errno,                          DHT_MSG_INODE_LK_ERROR, -                        "acquiring inodelk failed " +                        "protecting namespace of %s failed. "                          "rename (%s:%s:%s %s:%s:%s)", -                        local->loc.path, src_gfid, local->src_cached->name, +                        local->current == &local->lock[0] ? local->loc.path +                        : local->loc2.path, +                        local->loc.path, src_gfid, local->src_hashed->name,                          local->loc2.path, dst_gfid, -                        local->dst_cached ? local->dst_cached->name : NULL); +                        local->dst_hashed ? local->dst_hashed->name : NULL);                  local->op_ret = -1;                  local->op_errno = op_errno; @@ -1588,7 +1790,19 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  goto done;          } -        count = local->call_cnt = local->lock[0].layout.parent_layout.lk_count; +        /* dst_cached might've changed. This normally happens for two reasons: +         * 1. rebalance migrated dst +         * 2. Another parallel rename was done overwriting dst +         * +         * Doing a lookup on local->loc2 when dst exists, but is associated +         * with a different gfid will result in an ESTALE error. So, do a fresh +         * lookup with a new inode on dst-path and handle change of dst-cached +         * in the cbk. Also, to identify dst-cached changes we do a lookup on +         * "this" rather than the subvol. +         */ +        loc_copy (&local->loc2_copy, &local->loc2); +        inode_unref (local->loc2_copy.inode); +        local->loc2_copy.inode = inode_new (local->loc.inode->table);          /* Why not use local->lock.locks[?].loc for lookup post lock phase           * --------------------------------------------------------------- @@ -1608,13 +1822,26 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,           * exists with the name that the client requested with.           * */ -        for (i = 0; i < count; i++) { -                STACK_WIND_COOKIE (frame, dht_rename_lookup_cbk, (void *)(long)i, -                                   local->lock[0].layout.parent_layout.locks[i]->xl, -                                   local->lock[0].layout.parent_layout.locks[i]->xl->fops->lookup, -                                   ((gf_uuid_compare (local->loc.gfid, \ -                                     local->lock[0].layout.parent_layout.locks[i]->loc.gfid) == 0) ? -                                    &local->loc : &local->loc2), xattr_req); +        local->call_cnt = 2; +        for (i = 0; i < 2; i++) { +                if (i == 0) { +                        lock = local->rename_inodelk_backward_compatible[0]; +                        if (gf_uuid_compare (local->loc.gfid, +                                             lock->loc.gfid) == 0) +                                subvol = lock->xl; +                        else { +                                lock = local->rename_inodelk_backward_compatible[1]; +                                subvol = lock->xl; +                        } +                } else { +                        subvol = this; +                } + +                STACK_WIND_COOKIE (frame, dht_rename_lookup_cbk, +                                   (void *)(long)i, subvol, +                                   subvol->fops->lookup, +                                   (i == 0) ? &local->loc : &local->loc2_copy, +                                   xattr_req);          }          dict_unref (xattr_req); @@ -1644,7 +1871,8 @@ dht_rename_lock (call_frame_t *frame)          if (local->dst_cached)                  count++; -        lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_pointer); +        lk_array = GF_CALLOC (count, sizeof (*lk_array), +                              gf_common_mt_pointer);          if (lk_array == NULL)                  goto err; @@ -1655,22 +1883,40 @@ dht_rename_lock (call_frame_t *frame)                  goto err;          if (local->dst_cached) { +                /* dst might be removed by the time inodelk reaches bricks, +                 * which can result in ESTALE errors. POSIX imposes no +                 * restriction for dst to be present for renames to be +                 * successful. So, we'll ignore ESTALE errors. As far as +                 * synchronization on dst goes, we'll achieve the same by +                 * holding entrylk on parent directory of dst in the namespace +                 * of basename(dst). Also, there might not be quorum in cluster +                 * xlators like EC/disperse on errno, in which case they return +                 * EIO. For eg., in a disperse (4 + 2), 3 might return success +                 * and three might return ESTALE. Disperse, having no Quorum +                 * unwinds inodelk with EIO. So, ignore EIO too. +                 */                  lk_array[1] = dht_lock_new (frame->this, local->dst_cached,                                              &local->loc2, F_WRLCK,                                              DHT_FILE_MIGRATE_DOMAIN, NULL, -                                            FAIL_ON_ANY_ERROR); +                                            IGNORE_ENOENT_ESTALE_EIO);                  if (lk_array[1] == NULL)                          goto err;          } -        local->lock[0].layout.parent_layout.locks = lk_array; -        local->lock[0].layout.parent_layout.lk_count = count; +        local->rename_inodelk_backward_compatible = lk_array; +        local->rename_inodelk_bc_count = count; +        /* retaining inodelks for the sake of backward compatibility. Please +         * make sure to remove this inodelk once all of 3.10, 3.12 and 3.13 +         * reach EOL. Better way of getting synchronization would be to acquire +         * entrylks on src and dst parent directories in the namespace of +         * basenames of src and dst +         */          ret = dht_blocking_inodelk (frame, lk_array, count, -                                    dht_rename_lock_cbk); +                                    dht_rename_file_protect_namespace);          if (ret < 0) { -                local->lock[0].layout.parent_layout.locks = NULL; -                local->lock[0].layout.parent_layout.lk_count = 0; +                local->rename_inodelk_backward_compatible = NULL; +                local->rename_inodelk_bc_count = 0;                  goto err;          } @@ -1701,6 +1947,7 @@ dht_rename (call_frame_t *frame, xlator_t *this,          dht_local_t *local                  = NULL;          dht_conf_t  *conf                   = NULL;          char         gfid[GF_UUID_BUF_SIZE] = {0}; +        char         newgfid[GF_UUID_BUF_SIZE] = {0};          VALIDATE_OR_GOTO (frame, err);          VALIDATE_OR_GOTO (this, err); @@ -1772,11 +2019,15 @@ dht_rename (call_frame_t *frame, xlator_t *this,          if (xdata)                  local->xattr_req = dict_ref (xdata); +        if (newloc->inode) +                gf_uuid_unparse(newloc->inode->gfid, newgfid); +          gf_msg (this->name, GF_LOG_INFO, 0,                  DHT_MSG_RENAME_INFO, -                "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", -                oldloc->path, src_hashed->name, src_cached->name, -                newloc->path, dst_hashed->name, +                "renaming %s (%s) (hash=%s/cache=%s) => %s (%s) " +                "(hash=%s/cache=%s) ", +                oldloc->path, gfid, src_hashed->name, src_cached->name, +                newloc->path, newloc->inode ? newgfid : NULL, dst_hashed->name,                  dst_cached ? dst_cached->name : "<nul>");          if (IA_ISDIR (oldloc->inode->ia_type)) { @@ -1784,8 +2035,10 @@ dht_rename (call_frame_t *frame, xlator_t *this,          } else {                  local->op_ret = 0;                  ret = dht_rename_lock (frame); -                if (ret < 0) +                if (ret < 0) { +                        op_errno = ENOMEM;                          goto err; +                }          }          return 0;  | 
