diff options
author | Shyam <srangana@redhat.com> | 2014-11-12 10:12:13 -0500 |
---|---|---|
committer | Raghavendra G <rgowdapp@redhat.com> | 2015-02-17 08:07:43 -0800 |
commit | 7c6da2f7ceea2956197641b6cdb1e2f79cdb063e (patch) | |
tree | b55163a9710b0613ec781bf9994086aae8d082be | |
parent | de6f4ce0dd894c78632bf7b2ebc00d8044fba688 (diff) |
cluster/dht: Fix dht_link to follow files under migration
Currently if a file is under migration, a hardlink to that file
is lost post migration of the file. This is due to the fact that
the hard link is created against the cached subvol of the source
and, as the source is under migration, it shifts to a linkto file
post migration, thus losing the hardlink.
This change follows the stat information that triggers a phase1/2
detection for a file under migration, to create the link on the new
subvol that the source file is migrating to. Thereby preserving the
hard link post migration.
NOTES:
The test case added creates a ~1GB file, so that we can catch the file
during migration; smaller files may not capture this state and the
test may fail.
Even if migration of the file fails, we would only be left with stale
linkto files on the subvol that the source was migrating to, which is
not a problem.
This change would create a double linkto, i.e. the new target hashed subvol
would point to the old source cached subvol, which would point to the real
cached subvol. This double redirection although not handled directly in
DHT, works as lookup searches everywhere on hitting linkto files. The
downside is that it never heals the new target hashed subvol linkto
file, which is another bug to be resolved (does not cause functional
impact).
Change-Id: I871e6885b15e65e05bfe70a0b0180605493cb534
BUG: 1161311
Signed-off-by: Shyam <srangana@redhat.com>
Reviewed-on: http://review.gluster.org/9105
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: N Balachandran <nbalacha@redhat.com>
Reviewed-by: susant palai <spalai@redhat.com>
Reviewed-by: venkatesh somyajulu <vsomyaju@redhat.com>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Tested-by: Raghavendra G <rgowdapp@redhat.com>
-rwxr-xr-x | tests/bugs/bug-1161311.t | 129 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 143 |
2 files changed, 253 insertions, 19 deletions
diff --git a/tests/bugs/bug-1161311.t b/tests/bugs/bug-1161311.t new file mode 100755 index 00000000000..52ed1555c20 --- /dev/null +++ b/tests/bugs/bug-1161311.t @@ -0,0 +1,129 @@ +#!/bin/bash + +# This tests for hard link preservation for files that are linked, when the +# file is undergoing migration + +# --- Improvements and other tests --- +## Fail rebalance of the large file for which links are created during P1/2 +### phases of migration +## Start with multiple hard links to the file and then create more during P1/2 +### phases of migration +## Test the same with NFS as the mount rather than FUSE +## Create links when file is under P2 of migration specifically +## Test with quota, to error out during hard link creation (if possible) + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +checksticky () { + i=0; + while [ ! -k $1 ]; do + sleep 1 + i=$((i+1)); + # Try for 10 seconds to get the sticky bit state + # else fail the test, as we may never see it + if [[ $i == 10 ]]; then + return $i + fi + echo "Waiting... $i" + done + echo "Done... 
got out @ $i" + return 0 +} + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +TEST $CLI volume create $V0 $H0:$B0/${V0}{1..3}; + +EXPECT "$V0" volinfo_field $V0 'Volume Name'; +EXPECT 'Created' volinfo_field $V0 'Status'; +EXPECT '3' brick_count $V0 + +TEST $CLI volume start $V0; +EXPECT 'Started' volinfo_field $V0 'Status'; + +## Mount FUSE with caching disabled (read-write) +TEST glusterfs -s $H0 --volfile-id $V0 $M0; + +# Create a directories to hold the links +TEST mkdir $M0/dir1 +TEST mkdir -p $M0/dir2/dir3 + +# Create a large file (1GB), so that rebalance takes time +dd if=/dev/urandom of=$M0/dir1/FILE2 bs=64k count=10240 + +# Rename the file to create a linkto, for rebalance to +# act on the file +## FILE1 and FILE2 hashes are, 678b1c4a e22c1ada, so they fall +## into separate bricks when brick count is 3 +TEST mv $M0/dir1/FILE2 $M0/dir1/FILE1 + +# unmount and remount the volume +TEST umount $M0 +TEST glusterfs -s $H0 --volfile-id $V0 $M0; + +# Start the rebalance +TEST $CLI volume rebalance $V0 start force + +# Wait for FILE to get the sticky bit on, so that file is under +# active rebalance, before creating the links +TEST checksticky $B0/${V0}3/dir1/FILE1 + +# Create the links +## FILE3 FILE5 FILE7 have hashes, c8c91469 566d26ce 22ce7eba +## Which fall into separate bricks on a 3 brick layout +cd $M0 +TEST ln ./dir1/FILE1 ./dir1/FILE7 +TEST ln ./dir1/FILE1 ./dir1/FILE5 +TEST ln ./dir1/FILE1 ./dir1/FILE3 + +TEST ln ./dir1/FILE1 ./dir2/FILE7 +TEST ln ./dir1/FILE1 ./dir2/FILE5 +TEST ln ./dir1/FILE1 ./dir2/FILE3 + +TEST ln ./dir1/FILE1 ./dir2/dir3/FILE7 +TEST ln ./dir1/FILE1 ./dir2/dir3/FILE5 +TEST ln ./dir1/FILE1 ./dir2/dir3/FILE3 +cd / + +# Ideally for this test to have done its job, the file should still be +# under migration, so check the sticky bit again +TEST checksticky $B0/${V0}3/dir1/FILE1 + +# Wait for rebalance to complete +EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0 + +# Check if all files are 
clean and migrated right +## stat on the original file should show linkcount of 10 +linkcountsrc=$(stat -c %h $M0/dir1/FILE1) +TEST [[ $linkcountsrc == 10 ]] + +## inode and size of every file should be same as original file +inodesrc=$(stat -c %i $M0/dir1/FILE1) +TEST [[ $(stat -c %i $M0/dir1/FILE3) == $inodesrc ]] +TEST [[ $(stat -c %i $M0/dir1/FILE5) == $inodesrc ]] +TEST [[ $(stat -c %i $M0/dir1/FILE7) == $inodesrc ]] + +TEST [[ $(stat -c %i $M0/dir2/FILE3) == $inodesrc ]] +TEST [[ $(stat -c %i $M0/dir2/FILE5) == $inodesrc ]] +TEST [[ $(stat -c %i $M0/dir2/FILE7) == $inodesrc ]] + +TEST [[ $(stat -c %i $M0/dir2/dir3/FILE3) == $inodesrc ]] +TEST [[ $(stat -c %i $M0/dir2/dir3/FILE5) == $inodesrc ]] +TEST [[ $(stat -c %i $M0/dir2/dir3/FILE7) == $inodesrc ]] + +# Check, newer link creations +cd $M0 +TEST ln ./dir1/FILE1 ./FILE1 +TEST ln ./dir2/FILE3 ./FILE3 +TEST ln ./dir2/dir3/FILE5 ./FILE5 +TEST ln ./dir1/FILE7 ./FILE7 +cd / +linkcountsrc=$(stat -c %h $M0/dir1/FILE1) +TEST [[ $linkcountsrc == 14 ]] + +cleanup; diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index d396c2ee4ab..bc9d04d36f8 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -28,6 +28,8 @@ #include <libgen.h> #include <signal.h> +int dht_link2 (xlator_t *this, call_frame_t *frame, int op_ret); + int dht_aggregate (dict_t *this, char *key, data_t *value, void *data) { @@ -4490,54 +4492,156 @@ err: return 0; } - int dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - call_frame_t *prev = NULL; - dht_layout_t *layout = NULL; dht_local_t *local = NULL; - - prev = cookie; + int ret = -1; + gf_boolean_t stbuf_merged = _gf_false; + xlator_t *subvol = NULL; local = frame->local; - if (op_ret == -1) - goto out; - - layout = dht_layout_for_subvol (this, prev->this); - if (!layout) { - 
gf_msg_debug (this->name, 0, - "no pre-set layout for subvolume %s", - prev->this->name); - op_ret = -1; - op_errno = EINVAL; + if (op_ret == -1) { + /* No continuation on DHT inode missing errors, as we should + * then have a good stbuf that states P2 happened. We would + * get inode missing if, the file completed migrated between + * the lookup and the link call */ goto out; } + /* Update parent on success, even if P1/2 checks are positve. + * The second call on success will further update the parent */ if (local->loc.parent) { dht_inode_ctx_time_update (local->loc.parent, this, preparent, 0); dht_inode_ctx_time_update (local->loc.parent, this, postparent, 1); } - if (local->linked == _gf_true) { - local->stbuf = *stbuf; + + /* Update linkto attrs, if this is the first call and non-P2, + * if we detect P2 then we need to trust the attrs from the + * second call, not the first */ + if (local->linked == _gf_true && + ((local->call_cnt == 1 && !IS_DHT_MIGRATION_PHASE2 (stbuf)) + || (local->call_cnt != 1 && + IS_DHT_MIGRATION_PHASE2 (&local->stbuf)))) { + dht_iatt_merge (this, &local->stbuf, stbuf, NULL); + stbuf_merged = _gf_true; dht_linkfile_attr_heal (frame, this); } + + /* No further P1/2 checks if we are in the second iteration of + * the call */ + if (local->call_cnt != 1) { + goto out; + } else { + /* Preserve the return values, in case the migration decides + * to recreate the link on the same subvol that the current + * hased for the link was created on. 
*/ + dht_iatt_merge (this, &local->preparent, + preparent, NULL); + dht_iatt_merge (this, &local->postparent, + postparent, NULL); + if (!stbuf_merged) { + dht_iatt_merge (this, &local->stbuf, + stbuf, NULL); + stbuf_merged = _gf_true; + } + + local->inode = inode_ref (inode); + } + + local->op_errno = op_errno; + local->rebalance.target_op_fn = dht_link2; + /* Check if the rebalance phase2 is true */ + if (IS_DHT_MIGRATION_PHASE2 (stbuf)) { + ret = dht_inode_ctx_get1 (this, local->loc.inode, &subvol); + if (!subvol) { + /* Phase 2 of migration */ + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } else { + dht_link2 (this, frame, 0); + return 0; + } + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (stbuf)) { + ret = dht_inode_ctx_get1 (this, local->loc.inode, &subvol); + if (subvol) { + dht_link2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } out: DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, - postparent, NULL); + + DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, + preparent, postparent, NULL); return 0; } int +dht_link2 (xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + int op_errno = EINVAL; + + local = frame->local; + if (!local) + goto err; + + op_errno = local->op_errno; + if (op_ret == -1) + goto err; + + dht_inode_ctx_get1 (this, local->loc.inode, &subvol); + if (!subvol) { + subvol = local->cached_subvol; + if (!subvol) { + op_errno = EINVAL; + goto err; + } + } + + /* Second call to create link file could result in EEXIST as the + * first call created the linkto in the currently + * migrating subvol, which could be the new hashed subvol */ + if (local->link_subvol == subvol) { + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (link, frame, 0, 0, local->inode, + &local->stbuf, 
&local->preparent, + &local->postparent, NULL); + + return 0; + } + + local->call_cnt = 2; + + STACK_WIND (frame, dht_link_cbk, subvol, subvol->fops->link, + &local->loc, &local->loc2, NULL); + + return 0; +err: + DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); + + return 0; +} + +int dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf, @@ -4588,6 +4692,7 @@ dht_link (call_frame_t *frame, xlator_t *this, goto err; } + local->call_cnt = 1; cached_subvol = local->cached_subvol; if (!cached_subvol) { |