diff options
| author | Susant Palai <spalai@redhat.com> | 2018-01-18 13:06:12 +0530 | 
|---|---|---|
| committer | Raghavendra G <rgowdapp@redhat.com> | 2018-02-02 15:24:38 +0000 | 
| commit | 545a7ce6762a1b3a7b989b43a9d18b5b1b299df0 (patch) | |
| tree | 0f2c3015697553914cb520dbda107f3843521f53 | |
| parent | d9f773ba719397c12860f494a8cd38109e4b2fe3 (diff) | |
cluster/dht: avoid overwriting client writes during migration
For more details on this issue see
https://github.com/gluster/glusterfs/issues/308
Solution:
This is a restrictive solution where a file will not be migrated
if a client writes to it during the migration. This does not
check if the writes from the rebalance and the client actually
do overlap.
If dht_writev_cbk finds that the file is being migrated (PHASE1)
it will set an xattr on the destination file indicating the file
was updated by a non-rebalance client.
Rebalance checks if any other client has written to the dst file
and aborts the file migration if it finds the xattr.
updates gluster/glusterfs#308
Change-Id: I73aec28bc9dbb8da57c7425ec88c6b6af0fbc9dd
Signed-off-by: Susant Palai <spalai@redhat.com>
Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
Signed-off-by: N Balachandran <nbalacha@redhat.com>
| -rw-r--r-- | libglusterfs/src/globals.h | 5 | ||||
| -rw-r--r-- | tests/basic/distribute/force-migration.t | 50 | ||||
| -rwxr-xr-x | tests/basic/tier/fops-during-migration.t | 1 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 2 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-inode-write.c | 28 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 103 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 17 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 8 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-entry-ops.c | 12 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-handle.h | 4 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 42 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-inode-fd-ops.c | 9 | 
12 files changed, 269 insertions, 12 deletions
| diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h index 315eec1e667..487facd9183 100644 --- a/libglusterfs/src/globals.h +++ b/libglusterfs/src/globals.h @@ -19,6 +19,11 @@  #define GD_MIN_OP_VERSION_KEY "minimum-operating-version"  #define GD_MAX_OP_VERSION_KEY "maximum-operating-version" +#define GF_PROTECT_FROM_EXTERNAL_WRITES "trusted.glusterfs.protect.writes" +#define GF_AVOID_OVERWRITE "glusterfs.avoid.overwrite" +#define GF_CLEAN_WRITE_PROTECTION "glusterfs.clean.writexattr" + +  /* Gluster versions - OP-VERSION mapping   *   * 3.3.x                - 1 diff --git a/tests/basic/distribute/force-migration.t b/tests/basic/distribute/force-migration.t new file mode 100644 index 00000000000..f6c4997a505 --- /dev/null +++ b/tests/basic/distribute/force-migration.t @@ -0,0 +1,50 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +#This tests checks if the file migration fails with force-migration +#option set to off. + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.read-ahead off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +TEST touch $M0/file +#This rename creates a link file for tile in the other brick. +TEST mv $M0/file $M0/tile +#Lets keep writing to the file which will have a open fd +dd if=/dev/zero of=$M0/tile bs=1b & +bg_pid=$! +#Now rebalance will try to skip the file +TEST $CLI volume set $V0 force-migration off +TEST $CLI volume rebalance $V0 start force +EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0 +skippedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $6}'` +TEST [[ $skippedcount -eq 1 ]] +#file should be migrated now +TEST $CLI volume set $V0 force-migration on +TEST $CLI volume rebalance $V0 start force +EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0 +skippedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $6}'` +rebalancedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $2}'` +TEST [[ $skippedcount -eq 0 ]] +TEST [[ $rebalancedcount -eq 1 ]] +kill -9 $bg_pid > /dev/null 2>&1 +wait > /dev/null 2>&1 +cleanup +#Bad test because we are not sure writes are happening at the time of +#rebalance. We need to write a test case which makes sure client +#writes happen during rebalance. One way would be to set S+T bits on +#src and write to file from client and then start rebalance. Currently +#marking this as bad test. +#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000 + diff --git a/tests/basic/tier/fops-during-migration.t b/tests/basic/tier/fops-during-migration.t index c85b4e337cc..458c01e93c5 100755 --- a/tests/basic/tier/fops-during-migration.t +++ b/tests/basic/tier/fops-during-migration.t @@ -22,6 +22,7 @@ function create_dist_tier_vol () {          TEST $CLI volume set $V0 performance.quick-read off          TEST $CLI volume set $V0 performance.io-cache off          TEST $CLI volume set $V0 features.ctr-enabled on +        TEST $CLI volume set $V0 cluster.force-migration on          TEST $CLI volume start $V0          TEST $CLI volume tier $V0 attach $H0:$B0/hot/${V0}{0..$1}          TEST $CLI volume set $V0 cluster.tier-demote-frequency $DEMOTE_FREQ diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 91ba3418643..9671bbe1cbe 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -695,6 +695,8 @@ struct dht_conf {          synclock_t      link_lock;          gf_boolean_t    use_fallocate; + +        gf_boolean_t    force_migration;  };  typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 7c596b1c099..226ee95e8b3 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -95,6 +95,33 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          /* Check if the rebalance phase1 is true */          if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { +                if (!dht_is_tier_xlator (this)) { +                        if (!local->xattr_req) { +                                local->xattr_req = dict_new (); +                                if (!local->xattr_req) { +                                        gf_msg (this->name, GF_LOG_ERROR, +                                                DHT_MSG_NO_MEMORY, +                                                ENOMEM, "insufficient memory"); +                                        local->op_errno = ENOMEM; +                                        local->op_ret = -1; +                                        goto out; +                                } +                        } + +                        ret = dict_set_uint32 (local->xattr_req, +                                               GF_PROTECT_FROM_EXTERNAL_WRITES, +                                               1); +                        if (ret) { +                                gf_msg (this->name, GF_LOG_ERROR, +                                        DHT_MSG_DICT_SET_FAILED, 0, +                                        "Failed to set key %s in dictionary", +                                        GF_PROTECT_FROM_EXTERNAL_WRITES); +                                local->op_errno = ENOMEM; +                                local->op_ret = -1; +                                goto out; +                        } +                } +                  dht_iatt_merge (this, &local->stbuf, postbuf, NULL);                  dht_iatt_merge (this, &local->prebuf, prebuf, NULL); @@ -146,7 +173,6 @@ dht_writev2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)                  return 0;          } -          if (subvol == NULL)                  goto out; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 70d5a5f316f..d9c3149049c 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -26,7 +26,8 @@  #define MAX_REBAL_TYPE_SIZE               16  #define FILE_CNT_INTERVAL                600 /* 10 mins */  #define ESTIMATE_START_INTERVAL          600 /* 10 mins */ - +#define HARDLINK_MIG_INPROGRESS          -2 +#define SKIP_MIGRATION_FD_POSITIVE       -3  #ifndef MAX  #define MAX(a, b) (((a) > (b))?(a):(b))  #endif @@ -680,6 +681,7 @@ __dht_rebalance_create_dst_file (xlator_t *this, xlator_t *to, xlator_t *from,          struct iatt  check_stbuf= {0,};          dht_conf_t  *conf = NULL;          dict_t      *dict = NULL; +        dict_t      *xdata = NULL;          conf = this->private; @@ -725,7 +727,31 @@ __dht_rebalance_create_dst_file (xlator_t *this, xlator_t *to, xlator_t *from,                  goto out;          } -        ret = syncop_lookup (to, loc, &new_stbuf, NULL, NULL, NULL); +        if (!!dht_is_tier_xlator (this)) { +                xdata = dict_new (); +                if (!xdata) { +                        *fop_errno = ENOMEM; +                        ret = -1; +                        gf_msg (this->name, GF_LOG_ERROR, ENOMEM, +                                DHT_MSG_MIGRATE_FILE_FAILED, +                                "%s: dict_new failed)", +                                loc->path); +                        goto out; +                } + +                ret = dict_set_int32 (xdata, GF_CLEAN_WRITE_PROTECTION, 1); +                if (ret) { +                        *fop_errno = ENOMEM; +                        ret = -1; +                        gf_msg (this->name, GF_LOG_ERROR, 0, +                                DHT_MSG_DICT_SET_FAILED, +                                "%s: failed to set dictionary value: key = %s ", +                                loc->path, GF_CLEAN_WRITE_PROTECTION); +                        goto out; +                } +        } + +        ret = syncop_lookup (to, loc, &new_stbuf, NULL, xdata, NULL);          if (!ret) {                  /* File exits in the destination, check if gfid matches */                  if (gf_uuid_compare (stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) { @@ -875,6 +901,10 @@ out:          if (dict)                  dict_unref (dict); +        if (xdata) +                dict_unref (dict); + +          return ret;  } @@ -1090,9 +1120,9 @@ out:  }  static int -__dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, -                              xlator_t *to, fd_t *src, fd_t *dst, -                              uint64_t ia_size, int hole_exists, +__dht_rebalance_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, +                              xlator_t *from, xlator_t *to, fd_t *src, +                              fd_t *dst, uint64_t ia_size, int hole_exists,                                int *fop_errno)  {          int            ret    = 0; @@ -1102,7 +1132,10 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from,          struct iobref *iobref = NULL;          uint64_t       total  = 0;          size_t         read_size = 0; +        dict_t        *xdata = NULL; +        dht_conf_t    *conf  = NULL; +        conf = this->private;          /* if file size is '0', no need to enter this loop */          while (total < ia_size) {                  read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ? @@ -1121,8 +1154,42 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from,                                                      ret, offset, iobref,                                                      fop_errno);                  } else { +                        if (!conf->force_migration && +                            !dht_is_tier_xlator (this)) { +                                xdata = dict_new (); +                                if (!xdata) { +                                        gf_msg ("dht", GF_LOG_ERROR, 0, +                                                DHT_MSG_MIGRATE_FILE_FAILED, +                                                "insufficient memory"); +                                        ret = -1; +                                        *fop_errno = ENOMEM; +                                        break; +                                } + +                                /* Fail this write and abort rebalance if we +                                 * detect a write from client since migration of +                                 * this file started. This is done to avoid +                                 * potential data corruption due to out of order +                                 * writes from rebalance and client to the same +                                 * region (as compared between src and dst +                                 * files). See +                                 * https://github.com/gluster/glusterfs/issues/308 +                                 * for more details. +                                 */ +                                ret = dict_set_int32 (xdata, +                                                      GF_AVOID_OVERWRITE, 1); +                                if (ret) { +                                        gf_msg ("dht", GF_LOG_ERROR, 0, +                                                ENOMEM, "failed to set dict"); +                                        ret = -1; +                                        *fop_errno = ENOMEM; +                                        break; +                                } + +                        } +                          ret = syncop_writev (to, dst, vector, count, -                                             offset, iobref, 0, NULL, NULL); +                                             offset, iobref, 0, xdata, NULL);                          if (ret < 0) {                                  *fop_errno = -ret;                          } @@ -1158,6 +1225,10 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from,          else                  ret = -1; +        if (xdata) { +                dict_unref (xdata); +        } +          return ret;  } @@ -1575,7 +1646,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,                  goto out;          } -          /* Do not migrate file in case lock migration is not enabled on the           * volume*/          if (!conf->lock_migration_enabled) { @@ -1642,7 +1712,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,          ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag, conf,                                      fop_errno);          if (ret) { -                if (ret == -2) +                if (ret == HARDLINK_MIG_INPROGRESS)                          ret = 0;                  goto out;          } @@ -1785,7 +1855,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,          ret = __check_file_has_hardlink (this, loc, &stbuf, xattr_rsp,                                           flag, defrag, conf, fop_errno);          if (ret) { -                if (ret == -2) +                if (ret == HARDLINK_MIG_INPROGRESS)                          ret = 0;                  goto out;          } @@ -1794,8 +1864,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,                  file_has_holes = 1; -        ret = __dht_rebalance_migrate_data (defrag, from, to, src_fd, dst_fd, -                                            stbuf.ia_size, +        ret = __dht_rebalance_migrate_data (this, defrag, from, to, +                                            src_fd, dst_fd, stbuf.ia_size,                                              file_has_holes, fop_errno);          if (ret) {                  gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2280,6 +2350,17 @@ out:                  }          } +        if (!dht_is_tier_xlator (this)) { +                lk_ret = syncop_removexattr (to, loc, +                                             GF_PROTECT_FROM_EXTERNAL_WRITES, +                                             NULL, NULL); +                if (lk_ret) { +                        gf_msg (this->name, GF_LOG_WARNING, -lk_ret, 0, +                                "%s: removexattr failed key %s", loc->path, +                                GF_CLEAN_WRITE_PROTECTION); +                } +        } +          if (dict)                  dict_unref (dict); diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 8c011f72530..51c9d9cb3cf 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -524,6 +524,10 @@ dht_reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("lock-migration", conf->lock_migration_enabled,                            options, bool, out); +        GF_OPTION_RECONF ("force-migration", conf->force_migration, +                          options, bool, out); + +          if (conf->defrag) {                  if (dict_get_str (options, "rebal-throttle", &temp_str) == 0) {                          ret = dht_configure_throttle (this, conf, temp_str); @@ -810,6 +814,10 @@ dht_init (xlator_t *this)          GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled,                           bool, err); +        GF_OPTION_INIT ("force-migration", conf->force_migration, +                        bool, err); + +          if (defrag) {                defrag->lock_migration_enabled = conf->lock_migration_enabled; @@ -1203,5 +1211,14 @@ struct volume_options options[] = {            .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC          }, +        { .key =  {"force-migration"}, +          .type = GF_OPTION_TYPE_BOOL, +          .default_value = "off", +          .description = "If disabled, rebalance will not migrate files that " +                         "are being written to by an application", +          .op_version  = {GD_OP_VERSION_4_0_0}, +          .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC +        }, +          { .key  = {NULL} },  }; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 50827caaa0a..e293acfa2cc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1349,6 +1349,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .flags       = VOLOPT_FLAG_CLIENT_OPT,          }, +        { .key         = "cluster.force-migration", +          .voltype     = "cluster/distribute", +          .option      = "force-migration", +          .value       = "off", +          .op_version  = GD_OP_VERSION_4_0_0, +          .flags       = VOLOPT_FLAG_CLIENT_OPT, +        }, +          /* NUFA xlator options (Distribute special case) */          { .key        = "cluster.nufa",            .voltype    = "cluster/distribute", diff --git a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c index 4d7ed5be7c8..41d8c873b4c 100644 --- a/xlators/storage/posix/src/posix-entry-ops.c +++ b/xlators/storage/posix/src/posix-entry-ops.c @@ -180,6 +180,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this,          int32_t     nlink_samepgfid    = 0;          struct  posix_private *priv    = NULL;          posix_inode_ctx_t *ctx         = NULL; +        int         ret                = 0;          VALIDATE_OR_GOTO (frame, out);          VALIDATE_OR_GOTO (this, out); @@ -248,6 +249,17 @@ posix_lookup (call_frame_t *frame, xlator_t *this,          if (xdata && (op_ret == 0)) {                  xattr = posix_xattr_fill (this, real_path, loc, NULL, -1, xdata,                                            &buf); + +                if (dict_get (xdata, GF_CLEAN_WRITE_PROTECTION)) { +                        ret = sys_lremovexattr (real_path, +                                                GF_PROTECT_FROM_EXTERNAL_WRITES); +                        if (ret == -1 && (errno != ENODATA && errno != ENOATTR)) +                                gf_msg (this->name, GF_LOG_ERROR, +                                        P_MSG_XATTR_NOT_REMOVED, errno, +                                        "removexattr failed. key %s path %s", +                                        GF_PROTECT_FROM_EXTERNAL_WRITES, +                                        loc->path); +                }          }          if (priv->update_pgfid_nlinks) { diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h index 97186f91e64..8a07cf2b57d 100644 --- a/xlators/storage/posix/src/posix-handle.h +++ b/xlators/storage/posix/src/posix-handle.h @@ -186,4 +186,8 @@ int  posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid,                                    char *real_path, inode_table_t *itable); +int +posix_check_internal_writes (xlator_t *this, fd_t *fd, int sysfd, +                             dict_t *xdata); +  #endif /* !_POSIX_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 90afced604c..56c9b4afe94 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -2739,3 +2739,45 @@ posix_override_umask (mode_t mode, mode_t mode_bit)          gf_msg_debug ("posix", 0, "The value of mode is %u", mode);          return mode;  } + +int +posix_check_internal_writes (xlator_t *this, fd_t *fd, int sysfd, +                             dict_t *xdata) +{ +        int          ret = 0; +        size_t       xattrsize = 0; +        data_t      *val = NULL; + +        LOCK (&fd->inode->lock); +        { +                val = dict_get (xdata, GF_PROTECT_FROM_EXTERNAL_WRITES); +                if (val) { +                        ret = sys_fsetxattr (sysfd, +                                             GF_PROTECT_FROM_EXTERNAL_WRITES, +                                             val->data, val->len, 0); +                        if (ret == -1) { +                                gf_msg (this->name, GF_LOG_ERROR, +                                        P_MSG_XATTR_FAILED, +                                        errno, "setxattr failed key %s", +                                        GF_PROTECT_FROM_EXTERNAL_WRITES); +                        } + +                        goto out; +                } + +                if (dict_get (xdata, GF_AVOID_OVERWRITE)) { +                        xattrsize = sys_fgetxattr (sysfd, +                                                   GF_PROTECT_FROM_EXTERNAL_WRITES, +                                                   NULL, 0); +                        if ((xattrsize == -1) && ((errno == ENOATTR) || +                            (errno == ENODATA))) { +                                ret = 0; +                        } else { +                                ret = -1; +                        } +                } +        } +out: +        UNLOCK (&fd->inode->lock); +        return ret; +} diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c index c90bc6c438a..812cf792874 100644 --- a/xlators/storage/posix/src/posix-inode-fd-ops.c +++ b/xlators/storage/posix/src/posix-inode-fd-ops.c @@ -1579,6 +1579,15 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          _fd = pfd->fd; +        ret = posix_check_internal_writes (this, fd, _fd, xdata); +        if (ret < 0) { +                gf_msg (this->name, GF_LOG_ERROR, 0, 0, +                        "possible overwrite from internal client, fd=%p", fd); +                op_ret = -1; +                op_errno = EBUSY; +                goto out; +        } +          if (xdata) {                  if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND))                          write_append = _gf_true; | 
