diff options
author | Susant Palai <spalai@redhat.com> | 2018-01-18 13:06:12 +0530 |
---|---|---|
committer | Raghavendra G <rgowdapp@redhat.com> | 2018-02-02 15:24:38 +0000 |
commit | 545a7ce6762a1b3a7b989b43a9d18b5b1b299df0 (patch) | |
tree | 0f2c3015697553914cb520dbda107f3843521f53 | |
parent | d9f773ba719397c12860f494a8cd38109e4b2fe3 (diff) |
cluster/dht: avoid overwriting client writes during migration
For more details on this issue see
https://github.com/gluster/glusterfs/issues/308
Solution:
This is a restrictive solution where a file will not be migrated
if a client writes to it during the migration. This does not
check if the writes from the rebalance and the client actually
do overlap.
If dht_writev_cbk finds that the file is being migrated (PHASE1)
it will set an xattr on the destination file indicating the file
was updated by a non-rebalance client.
Rebalance checks if any other client has written to the dst file
and aborts the file migration if it finds the xattr.
updates gluster/glusterfs#308
Change-Id: I73aec28bc9dbb8da57c7425ec88c6b6af0fbc9dd
Signed-off-by: Susant Palai <spalai@redhat.com>
Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
Signed-off-by: N Balachandran <nbalacha@redhat.com>
-rw-r--r-- | libglusterfs/src/globals.h | 5 | ||||
-rw-r--r-- | tests/basic/distribute/force-migration.t | 50 | ||||
-rwxr-xr-x | tests/basic/tier/fops-during-migration.t | 1 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 2 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-inode-write.c | 28 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 103 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 17 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 8 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-entry-ops.c | 12 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-handle.h | 4 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 42 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-inode-fd-ops.c | 9 |
12 files changed, 269 insertions, 12 deletions
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h index 315eec1e667..487facd9183 100644 --- a/libglusterfs/src/globals.h +++ b/libglusterfs/src/globals.h @@ -19,6 +19,11 @@ #define GD_MIN_OP_VERSION_KEY "minimum-operating-version" #define GD_MAX_OP_VERSION_KEY "maximum-operating-version" +#define GF_PROTECT_FROM_EXTERNAL_WRITES "trusted.glusterfs.protect.writes" +#define GF_AVOID_OVERWRITE "glusterfs.avoid.overwrite" +#define GF_CLEAN_WRITE_PROTECTION "glusterfs.clean.writexattr" + + /* Gluster versions - OP-VERSION mapping * * 3.3.x - 1 diff --git a/tests/basic/distribute/force-migration.t b/tests/basic/distribute/force-migration.t new file mode 100644 index 00000000000..f6c4997a505 --- /dev/null +++ b/tests/basic/distribute/force-migration.t @@ -0,0 +1,50 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +#This tests checks if the file migration fails with force-migration +#option set to off. + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 $H0:$B0/${V0}0 $H0:$B0/${V0}1 +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.read-ahead off +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +TEST touch $M0/file +#This rename creates a link file for tile in the other brick. +TEST mv $M0/file $M0/tile +#Lets keep writing to the file which will have a open fd +dd if=/dev/zero of=$M0/tile bs=1b & +bg_pid=$! +#Now rebalance will try to skip the file +TEST $CLI volume set $V0 force-migration off +TEST $CLI volume rebalance $V0 start force +EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0 +skippedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $6}'` +TEST [[ $skippedcount -eq 1 ]] +#file should be migrated now +TEST $CLI volume set $V0 force-migration on +TEST $CLI volume rebalance $V0 start force +EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0 +skippedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $6}'` +rebalancedcount=`gluster v rebalance $V0 status | awk 'NR==3{print $2}'` +TEST [[ $skippedcount -eq 0 ]] +TEST [[ $rebalancedcount -eq 1 ]] +kill -9 $bg_pid > /dev/null 2>&1 +wait > /dev/null 2>&1 +cleanup +#Bad test because we are not sure writes are happening at the time of +#rebalance. We need to write a test case which makes sure client +#writes happen during rebalance. One way would be to set S+T bits on +#src and write to file from client and then start rebalance. Currently +#marking this as bad test. +#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000 + diff --git a/tests/basic/tier/fops-during-migration.t b/tests/basic/tier/fops-during-migration.t index c85b4e337cc..458c01e93c5 100755 --- a/tests/basic/tier/fops-during-migration.t +++ b/tests/basic/tier/fops-during-migration.t @@ -22,6 +22,7 @@ function create_dist_tier_vol () { TEST $CLI volume set $V0 performance.quick-read off TEST $CLI volume set $V0 performance.io-cache off TEST $CLI volume set $V0 features.ctr-enabled on + TEST $CLI volume set $V0 cluster.force-migration on TEST $CLI volume start $V0 TEST $CLI volume tier $V0 attach $H0:$B0/hot/${V0}{0..$1} TEST $CLI volume set $V0 cluster.tier-demote-frequency $DEMOTE_FREQ diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 91ba3418643..9671bbe1cbe 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -695,6 +695,8 @@ struct dht_conf { synclock_t link_lock; gf_boolean_t use_fallocate; + + gf_boolean_t force_migration; }; typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index 7c596b1c099..226ee95e8b3 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -95,6 +95,33 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, /* Check if the rebalance phase1 is true */ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + if (!dht_is_tier_xlator (this)) { + if (!local->xattr_req) { + local->xattr_req = dict_new (); + if (!local->xattr_req) { + gf_msg (this->name, GF_LOG_ERROR, + DHT_MSG_NO_MEMORY, + ENOMEM, "insufficient memory"); + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + } + + ret = dict_set_uint32 (local->xattr_req, + GF_PROTECT_FROM_EXTERNAL_WRITES, + 1); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, + DHT_MSG_DICT_SET_FAILED, 0, + "Failed to set key %s in dictionary", + GF_PROTECT_FROM_EXTERNAL_WRITES); + local->op_errno = ENOMEM; + local->op_ret = -1; + goto out; + } + } + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); @@ -146,7 +173,6 @@ dht_writev2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret) return 0; } - if (subvol == NULL) goto out; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 70d5a5f316f..d9c3149049c 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -26,7 +26,8 @@ #define MAX_REBAL_TYPE_SIZE 16 #define FILE_CNT_INTERVAL 600 /* 10 mins */ #define ESTIMATE_START_INTERVAL 600 /* 10 mins */ - +#define HARDLINK_MIG_INPROGRESS -2 +#define SKIP_MIGRATION_FD_POSITIVE -3 #ifndef MAX #define MAX(a, b) (((a) > (b))?(a):(b)) #endif @@ -680,6 +681,7 @@ __dht_rebalance_create_dst_file (xlator_t *this, xlator_t *to, xlator_t *from, struct iatt check_stbuf= {0,}; dht_conf_t *conf = NULL; dict_t *dict = NULL; + dict_t *xdata = NULL; conf = this->private; @@ -725,7 +727,31 @@ __dht_rebalance_create_dst_file (xlator_t *this, xlator_t *to, xlator_t *from, goto out; } - ret = syncop_lookup (to, loc, &new_stbuf, NULL, NULL, NULL); + if (!!dht_is_tier_xlator (this)) { + xdata = dict_new (); + if (!xdata) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: dict_new failed)", + loc->path); + goto out; + } + + ret = dict_set_int32 (xdata, GF_CLEAN_WRITE_PROTECTION, 1); + if (ret) { + *fop_errno = ENOMEM; + ret = -1; + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_DICT_SET_FAILED, + "%s: failed to set dictionary value: key = %s ", + loc->path, GF_CLEAN_WRITE_PROTECTION); + goto out; + } + } + + ret = syncop_lookup (to, loc, &new_stbuf, NULL, xdata, NULL); if (!ret) { /* File exits in the destination, check if gfid matches */ if (gf_uuid_compare (stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) { @@ -875,6 +901,10 @@ out: if (dict) dict_unref (dict); + if (xdata) + dict_unref (dict); + + return ret; } @@ -1090,9 +1120,9 @@ out: } static int -__dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, - xlator_t *to, fd_t *src, fd_t *dst, - uint64_t ia_size, int hole_exists, +__dht_rebalance_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, + xlator_t *from, xlator_t *to, fd_t *src, + fd_t *dst, uint64_t ia_size, int hole_exists, int *fop_errno) { int ret = 0; @@ -1102,7 +1132,10 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, struct iobref *iobref = NULL; uint64_t total = 0; size_t read_size = 0; + dict_t *xdata = NULL; + dht_conf_t *conf = NULL; + conf = this->private; /* if file size is '0', no need to enter this loop */ while (total < ia_size) { read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ? @@ -1121,8 +1154,42 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, ret, offset, iobref, fop_errno); } else { + if (!conf->force_migration && + !dht_is_tier_xlator (this)) { + xdata = dict_new (); + if (!xdata) { + gf_msg ("dht", GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "insufficient memory"); + ret = -1; + *fop_errno = ENOMEM; + break; + } + + /* Fail this write and abort rebalance if we + * detect a write from client since migration of + * this file started. This is done to avoid + * potential data corruption due to out of order + * writes from rebalance and client to the same + * region (as compared between src and dst + * files). See + * https://github.com/gluster/glusterfs/issues/308 + * for more details. + */ + ret = dict_set_int32 (xdata, + GF_AVOID_OVERWRITE, 1); + if (ret) { + gf_msg ("dht", GF_LOG_ERROR, 0, + ENOMEM, "failed to set dict"); + ret = -1; + *fop_errno = ENOMEM; + break; + } + + } + ret = syncop_writev (to, dst, vector, count, - offset, iobref, 0, NULL, NULL); + offset, iobref, 0, xdata, NULL); if (ret < 0) { *fop_errno = -ret; } @@ -1158,6 +1225,10 @@ __dht_rebalance_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, else ret = -1; + if (xdata) { + dict_unref (xdata); + } + return ret; } @@ -1575,7 +1646,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - /* Do not migrate file in case lock migration is not enabled on the * volume*/ if (!conf->lock_migration_enabled) { @@ -1642,7 +1712,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag, conf, fop_errno); if (ret) { - if (ret == -2) + if (ret == HARDLINK_MIG_INPROGRESS) ret = 0; goto out; } @@ -1785,7 +1855,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, ret = __check_file_has_hardlink (this, loc, &stbuf, xattr_rsp, flag, defrag, conf, fop_errno); if (ret) { - if (ret == -2) + if (ret == HARDLINK_MIG_INPROGRESS) ret = 0; goto out; } @@ -1794,8 +1864,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, file_has_holes = 1; - ret = __dht_rebalance_migrate_data (defrag, from, to, src_fd, dst_fd, - stbuf.ia_size, + ret = __dht_rebalance_migrate_data (this, defrag, from, to, + src_fd, dst_fd, stbuf.ia_size, file_has_holes, fop_errno); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2280,6 +2350,17 @@ out: } } + if (!dht_is_tier_xlator (this)) { + lk_ret = syncop_removexattr (to, loc, + GF_PROTECT_FROM_EXTERNAL_WRITES, + NULL, NULL); + if (lk_ret) { + gf_msg (this->name, GF_LOG_WARNING, -lk_ret, 0, + "%s: removexattr failed key %s", loc->path, + GF_CLEAN_WRITE_PROTECTION); + } + } + if (dict) dict_unref (dict); diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 8c011f72530..51c9d9cb3cf 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -524,6 +524,10 @@ dht_reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("lock-migration", conf->lock_migration_enabled, options, bool, out); + GF_OPTION_RECONF ("force-migration", conf->force_migration, + options, bool, out); + + if (conf->defrag) { if (dict_get_str (options, "rebal-throttle", &temp_str) == 0) { ret = dht_configure_throttle (this, conf, temp_str); @@ -810,6 +814,10 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("lock-migration", conf->lock_migration_enabled, bool, err); + GF_OPTION_INIT ("force-migration", conf->force_migration, + bool, err); + + if (defrag) { defrag->lock_migration_enabled = conf->lock_migration_enabled; @@ -1203,5 +1211,14 @@ struct volume_options options[] = { .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC }, + { .key = {"force-migration"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "If disabled, rebalance will not migrate files that " + "are being written to by an application", + .op_version = {GD_OP_VERSION_4_0_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC + }, + { .key = {NULL} }, }; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 50827caaa0a..e293acfa2cc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1349,6 +1349,14 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = VOLOPT_FLAG_CLIENT_OPT, }, + { .key = "cluster.force-migration", + .voltype = "cluster/distribute", + .option = "force-migration", + .value = "off", + .op_version = GD_OP_VERSION_4_0_0, + .flags = VOLOPT_FLAG_CLIENT_OPT, + }, + /* NUFA xlator options (Distribute special case) */ { .key = "cluster.nufa", .voltype = "cluster/distribute", diff --git a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c index 4d7ed5be7c8..41d8c873b4c 100644 --- a/xlators/storage/posix/src/posix-entry-ops.c +++ b/xlators/storage/posix/src/posix-entry-ops.c @@ -180,6 +180,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this, int32_t nlink_samepgfid = 0; struct posix_private *priv = NULL; posix_inode_ctx_t *ctx = NULL; + int ret = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -248,6 +249,17 @@ posix_lookup (call_frame_t *frame, xlator_t *this, if (xdata && (op_ret == 0)) { xattr = posix_xattr_fill (this, real_path, loc, NULL, -1, xdata, &buf); + + if (dict_get (xdata, GF_CLEAN_WRITE_PROTECTION)) { + ret = sys_lremovexattr (real_path, + GF_PROTECT_FROM_EXTERNAL_WRITES); + if (ret == -1 && (errno != ENODATA && errno != ENOATTR)) + gf_msg (this->name, GF_LOG_ERROR, + P_MSG_XATTR_NOT_REMOVED, errno, + "removexattr failed. key %s path %s", + GF_PROTECT_FROM_EXTERNAL_WRITES, + loc->path); + } } if (priv->update_pgfid_nlinks) { diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h index 97186f91e64..8a07cf2b57d 100644 --- a/xlators/storage/posix/src/posix-handle.h +++ b/xlators/storage/posix/src/posix-handle.h @@ -186,4 +186,8 @@ int posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid, char *real_path, inode_table_t *itable); +int +posix_check_internal_writes (xlator_t *this, fd_t *fd, int sysfd, + dict_t *xdata); + #endif /* !_POSIX_HANDLE_H */ diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 90afced604c..56c9b4afe94 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -2739,3 +2739,45 @@ posix_override_umask (mode_t mode, mode_t mode_bit) gf_msg_debug ("posix", 0, "The value of mode is %u", mode); return mode; } + +int +posix_check_internal_writes (xlator_t *this, fd_t *fd, int sysfd, + dict_t *xdata) +{ + int ret = 0; + size_t xattrsize = 0; + data_t *val = NULL; + + LOCK (&fd->inode->lock); + { + val = dict_get (xdata, GF_PROTECT_FROM_EXTERNAL_WRITES); + if (val) { + ret = sys_fsetxattr (sysfd, + GF_PROTECT_FROM_EXTERNAL_WRITES, + val->data, val->len, 0); + if (ret == -1) { + gf_msg (this->name, GF_LOG_ERROR, + P_MSG_XATTR_FAILED, + errno, "setxattr failed key %s", + GF_PROTECT_FROM_EXTERNAL_WRITES); + } + + goto out; + } + + if (dict_get (xdata, GF_AVOID_OVERWRITE)) { + xattrsize = sys_fgetxattr (sysfd, + GF_PROTECT_FROM_EXTERNAL_WRITES, + NULL, 0); + if ((xattrsize == -1) && ((errno == ENOATTR) || + (errno == ENODATA))) { + ret = 0; + } else { + ret = -1; + } + } + } +out: + UNLOCK (&fd->inode->lock); + return ret; +} diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c index c90bc6c438a..812cf792874 100644 --- a/xlators/storage/posix/src/posix-inode-fd-ops.c +++ b/xlators/storage/posix/src/posix-inode-fd-ops.c @@ -1579,6 +1579,15 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = pfd->fd; + ret = posix_check_internal_writes (this, fd, _fd, xdata); + if (ret < 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, + "possible overwrite from internal client, fd=%p", fd); + op_ret = -1; + op_errno = EBUSY; + goto out; + } + if (xdata) { if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) write_append = _gf_true; |