diff options
-rw-r--r-- | tests/xfs-dio.t | 118 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 22 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 82 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 141 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-entry.c | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-name.c | 3 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 7 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heald.c | 42 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.c | 9 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 3 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-handle.c | 14 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix-helpers.c | 64 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix.c | 178 |
15 files changed, 669 insertions, 24 deletions
diff --git a/tests/xfs-dio.t b/tests/xfs-dio.t new file mode 100644 index 00000000000..c8cabd5766b --- /dev/null +++ b/tests/xfs-dio.t @@ -0,0 +1,118 @@ +#!/bin/bash + +. $(dirname $0)/include.rc +. $(dirname $0)/volume.rc + +write_data () { + path=$1 + shift + echo "$@" > $path +} + +create_index_entry () { + local brick=$1 + local gfid_str=$(gf_get_gfid_xattr $brick/$2) + local gfid_path=$(gf_gfid_xattr_to_str $gfid_str) + local xop_file=$(ls $brick/.glusterfs/indices/xattrop/xattrop-* \ + | tail -n1) + ln $xop_file $brick/.glusterfs/indices/xattrop/$gfid_path + setfattr -n trusted.glusterfs.validate-status -v suspect $brick/$2 +} + +get_vstatus () { + getfattr --name trusted.glusterfs.validate-status --only-values $1 \ + 2> /dev/null +} + +trap cleanup EXIT + +TEST glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} + +# Comment out the following line to see the "argh" and "blah" tests at the end +# fail. That's because normal self-heal can't deal with this particular +# condition. To do that, we must check the actual data (OK, checksums). That's +# expensive, but if there's corruption below us - e.g. filesystem bug, flaky +# disk - then it's what we have to do. +TEST $CLI volume set $V0 cluster.shd-validate-data on + +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +TEST $GFS -s $H0 --volfile-id=$V0 $M0 +TEST mkdir $M0/xyz +TEST write_data $M0/file-ok hello +TEST write_data $M0/file-use0 hello +TEST write_data $M0/xyz/file-use1 hello +TEST write_data $M0/file-bad hello +TEST umount $M0 + +# Corrupt a bunch of data. +TEST write_data $B0/${V0}2/file-use0 'argh!' +TEST write_data $B0/${V0}0/xyz/file-use1 'blah!' +TEST write_data $B0/${V0}1/file-bad 'diffX' +TEST write_data $B0/${V0}2/file-bad 'diffY' + +# Add the files to their indices. 
+TEST create_index_entry $B0/${V0}0 file-ok +TEST create_index_entry $B0/${V0}1 file-ok +TEST create_index_entry $B0/${V0}2 file-ok +TEST create_index_entry $B0/${V0}0 file-use0 +TEST create_index_entry $B0/${V0}1 file-use0 +TEST create_index_entry $B0/${V0}2 file-use0 +TEST create_index_entry $B0/${V0}0 xyz/file-use1 +TEST create_index_entry $B0/${V0}1 xyz/file-use1 +TEST create_index_entry $B0/${V0}2 xyz/file-use1 +TEST create_index_entry $B0/${V0}0 file-bad +TEST create_index_entry $B0/${V0}1 file-bad +TEST create_index_entry $B0/${V0}2 file-bad + +# Time to see what we can do. +TEST $CLI volume heal $V0 + +# These files are not marked in the normal way as needing heal (that's kind of +# the whole problem) so heal counts aren't useful. There are only a few tiny +# files, so just wait a few seconds for the heal to complete. +sleep 5 + +# Test the contents of the files. +EXPECT hello cat $B0/${V0}0/file-ok +EXPECT hello cat $B0/${V0}1/file-ok +EXPECT hello cat $B0/${V0}2/file-ok +EXPECT hello cat $B0/${V0}0/file-use0 +EXPECT hello cat $B0/${V0}1/file-use0 +EXPECT hello cat $B0/${V0}2/file-use0 +EXPECT hello cat $B0/${V0}0/xyz/file-use1 +EXPECT hello cat $B0/${V0}1/xyz/file-use1 +EXPECT hello cat $B0/${V0}2/xyz/file-use1 +# This was in three-way split brain, so the replicas should still diverge. +EXPECT hello cat $B0/${V0}0/file-bad +EXPECT diffX cat $B0/${V0}1/file-bad +EXPECT diffY cat $B0/${V0}2/file-bad + +# Now test validation states. 
+EXPECT clean get_vstatus $B0/${V0}0/file-ok +EXPECT clean get_vstatus $B0/${V0}1/file-ok +EXPECT clean get_vstatus $B0/${V0}2/file-ok +EXPECT clean get_vstatus $B0/${V0}0/file-use0 +EXPECT clean get_vstatus $B0/${V0}1/file-use0 +EXPECT repaired get_vstatus $B0/${V0}2/file-use0 +EXPECT repaired get_vstatus $B0/${V0}0/xyz/file-use1 +EXPECT clean get_vstatus $B0/${V0}1/xyz/file-use1 +EXPECT clean get_vstatus $B0/${V0}2/xyz/file-use1 +EXPECT suspect get_vstatus $B0/${V0}0/file-bad +EXPECT suspect get_vstatus $B0/${V0}1/file-bad +EXPECT suspect get_vstatus $B0/${V0}2/file-bad + +print_summary () { + for f in file-ok file-bad file-use0 file-use1; do + echo "=== FILE $f" + find $B0/ -name $f | xargs grep -E . + find $B0/ -name $f | xargs getfattr -d -e text \ + -m trusted.glusterfs.validate-status + done + echo "=== ORPHANS" + find $B0 -name '*.orig' | xargs grep -E . + find $B0 -name '*.link' | xargs ls -l +} + +#print_summary diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 7425b6688f2..f5210bd7d5d 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1267,7 +1267,7 @@ afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) return 0; } - ret = afr_xattr_req_prepare (this, xdata); + ret = afr_xattr_req_prepare (this, xdata, _gf_false); if (ret != 0) { dict_unref (xdata); afr_inode_refresh_done (frame, this, -ret); @@ -1360,7 +1360,7 @@ afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, int -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum) { int i = 0; afr_private_t *priv = NULL; @@ -1397,6 +1397,19 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) "Unable to set ancestry path key in dict "); } + if (checksum) { + ret = dict_set_int32 (xattr_req, "get-checksum", 1); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Unable to set get-checksum in dict "); + } 
+ ret = dict_set_int32 (xattr_req, "trusted.glusterfs.validate-status", 1); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Unable to set validate-status in dict "); + } + } + return ret; } @@ -1415,7 +1428,7 @@ afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, if (xattr_req && (xattr_req != local->xattr_req)) dict_copy (xattr_req, local->xattr_req); - ret = afr_xattr_req_prepare (this, local->xattr_req); + ret = afr_xattr_req_prepare (this, local->xattr_req, _gf_false); ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); if (ret < 0) { @@ -5906,7 +5919,8 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, replies = alloca0 (sizeof (*replies) * priv->child_count); - ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies, + priv->shd_validate_data); if (ret) goto out; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 800cf9705c9..5a9ab795a94 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -298,6 +298,51 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, return 0; } +static int +afr_selfheal_vstatus_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + + syncbarrier_wake (&local->barrier); + return 0; +} + +void +afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *targets, char *new_status) +{ + loc_t loc = {0, }; + dict_t *xattr; + + loc.inode = inode_ref (inode); + + xattr = dict_new (); + if (!xattr) { + gf_log (this->name, GF_LOG_WARNING, + "unable to allocate validate-status for %s", + uuid_utoa (inode->gfid)); + goto done; + } + + if (dict_set_str (xattr, "trusted.glusterfs.validate-status", + new_status) != 0) { + gf_log 
(this->name, GF_LOG_WARNING, + "couldn't clear validate-status for %s", + uuid_utoa (inode->gfid)); + goto done; + } + + AFR_ONLIST (targets, frame, afr_selfheal_vstatus_cbk, setxattr, + &loc, xattr, 0, NULL); + +done: + if (xattr) { + dict_unref (xattr); + } + loc_wipe (&loc); +} + void afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) @@ -1379,7 +1424,7 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, if (xattr) dict_copy (xattr, xattr_req); - if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + if (afr_xattr_req_prepare (frame->this, xattr_req, _gf_false) != 0) { dict_destroy (xattr_req); return NULL; } @@ -1406,10 +1451,11 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, return inode; } -int +static int afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, - uuid_t gfid, struct afr_reply *replies, - unsigned char *discover_on) + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on, + gf_boolean_t checksum) { loc_t loc = {0, }; dict_t *xattr_req = NULL; @@ -1423,7 +1469,7 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, if (!xattr_req) return -ENOMEM; - if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + if (afr_xattr_req_prepare (frame->this, xattr_req, checksum) != 0) { dict_destroy (xattr_req); return -ENOMEM; } @@ -1444,14 +1490,15 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, int afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, - uuid_t gfid, struct afr_reply *replies) + uuid_t gfid, struct afr_reply *replies, + gf_boolean_t checksum) { afr_private_t *priv = NULL; priv = frame->this->private; return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, - priv->child_up); + priv->child_up, checksum); } unsigned int @@ -1865,7 +1912,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, replies = alloca0 (sizeof (*replies) * 
priv->child_count); - ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies, _gf_false); if (ret) goto out; @@ -1983,6 +2030,22 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, if (data_selfheal) *data_selfheal = _gf_true; } + + if (priv->shd_validate_data && data_selfheal && !*data_selfheal) { + if (IA_ISREG (replies[i].poststat.ia_type)) { + gf_log (this->name, GF_LOG_INFO, + "forcing data self-heal on %s", + uuid_utoa (replies[i].poststat.ia_gfid)); + /* + * This will force our caller (e.g. + * afr_selfheal_do) to call afr_selfheal_data, + * even though it might otherwise think + * everything looks OK. From there, we'll do a + * more thorough inspection including checksums. + */ + *data_selfheal = _gf_true; + } + } } if (valid_cnt > 0 && link_inode) { @@ -1999,7 +2062,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, ret = 0; out: if (inode) - inode_unref (inode); + inode_unref (inode); if (replies) afr_replies_wipe (replies, priv->child_count); @@ -2140,7 +2203,6 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) &data_selfheal, &metadata_selfheal, &entry_selfheal); - if (ret) goto out; diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index c1e945bfd82..894c8e68f25 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -587,6 +587,135 @@ out: return source; } +static int +afr_move_aside (call_frame_t *frame, xlator_t *this, inode_t *inode, int i) +{ + afr_private_t *priv = this->private; + dict_t *xattr = NULL; + int ret = -1; + loc_t loc = {0, }; + + loc.inode = inode_ref (inode); + + xattr = dict_new (); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "failed to alloc move-aside dict for %s on child %d", + uuid_utoa (inode->gfid), i); + goto done; + } + + if (dict_set_str (xattr, 
"trusted.move-aside", "please") != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set move-aside xattr for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+                goto done;
+        }
+
+        if (syncop_setxattr (priv->children[i], &loc, xattr, 0,
+                             NULL, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to send move-aside fop for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+                goto done;
+        }
+
+        ret = 0;
+
+done:
+        if (xattr) {
+                dict_unref (xattr);
+        }
+        loc_wipe (&loc);
+
+        return ret;
+}
+
+/*
+ * Compare replica checksums of a "suspect" file.  Replicas outvoted by a
+ * strict majority are moved aside and made sinks; with no majority, all are.
+ */
+static void
+afr_handle_validation (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       unsigned char *sources, unsigned char *sinks,
+                       struct afr_reply *replies)
+{
+        afr_private_t   *priv = this->private;
+        uint32_t        *values;
+        int              i;
+        int              same_as[2] = {0, 0};
+        int              good = -1;     /* index holding the majority value */
+        char            *vstatus;
+
+        if (!priv->shd_validate_data) {
+                return;
+        }
+
+        values = alloca0 (sizeof (*values) * priv->child_count);
+        for (i = 0; i < priv->child_count; ++i) {
+                if (!replies[i].xdata) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no xdata for child %d", i);
+                        return;
+                }
+                if (dict_get_str (replies[i].xdata,
+                                  "trusted.glusterfs.validate-status",
+                                  &vstatus) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no validate-status for child %d", i);
+                        return;
+                }
+                if (strncmp (vstatus, "suspect", 7) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "validate-status = %s for child %d", vstatus, i);
+                        return;
+                }
+                if (dict_get_uint32 (replies[i].xdata, "checksum", &values[i]) != 0) {
+                        return;
+                }
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "checksum for child %d is 0x%x", i, values[i]);
+        }
+
+        /*
+         * Look only for a single odd man out, so comparing against the first
+         * two values suffices.  "All but one" is only a majority with three
+         * or more replicas; with two, a mismatch can't say which copy is good.
+         */
+        for (i = 0; i < priv->child_count; ++i) {
+                same_as[0] += (values[i] == values[0]);
+                same_as[1] += (values[i] == values[1]);
+        }
+        if (same_as[0] == priv->child_count) {
+                gf_log (this->name, GF_LOG_DEBUG, "everything's OK");
+                afr_selfheal_update_vstatus (frame, this, inode,
+                                             sources, "clean");
+                return;
+        }
+        if (priv->child_count >= 3) {
+                if (same_as[0] == (priv->child_count - 1)) {
+                        good = 0;
+                } else if (same_as[1] == (priv->child_count - 1)) {
+                        good = 1;
+                }
+        }
+        if (good < 0) {
+                gf_log (this->name, GF_LOG_WARNING, "three-way split on %s",
+                        uuid_utoa (inode->gfid));
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG, "odd man out, use %d", good);
+        }
+        for (i = 0; i < priv->child_count; ++i) {
+                if ((good < 0) || (values[i] != values[good])) {
+                        sources[i] = 0;
+                        sinks[i] = 1;
+                        if (good >= 0) {
+                                afr_move_aside (frame, this, inode, i);
+                        }
+                }
+        }
+}
+
 /*
  * __afr_selfheal_data_prepare:
  *
@@ -612,7 +741,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
         priv = this->private;
 
         ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
-                                              replies);
+                                              replies, priv->shd_validate_data);
         if (ret)
                 return ret;
 
@@ -625,6 +754,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
         if (ret)
                 return ret;
 
+        afr_handle_validation (frame, this, inode, sources, sinks, replies);
+
         /* Initialize the healed_sinks[] array optimistically to
            the intersection of to-be-healed (i.e sinks[]) and
            the list of servers which are up (i.e locked_on[]).
@@ -749,6 +880,14 @@ restore_time: sources, sinks, healed_sinks, undid_pending, AFR_DATA_TRANSACTION, locked_replies, data_lock); + + if (priv->shd_validate_data) { + afr_selfheal_update_vstatus (frame, this, fd->inode, + healed_sinks, "repaired"); + afr_selfheal_update_vstatus (frame, this, fd->inode, + sources, "clean"); + } + skip_undo_pending: afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, data_lock); diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 25f8ea313aa..e0a82426a33 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -489,7 +489,7 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, priv = this->private; ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, - replies); + replies, _gf_false); if (ret) return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 4570ace7ef7..85dbdf2976e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -336,7 +336,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i priv = this->private; ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, - replies); + replies, _gf_false); if (ret) return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index 8e5546a702f..db78ef81804 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -796,7 +796,8 @@ __afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *paren replies = alloca0 (priv->child_count * sizeof(*replies)); - ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies); + ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies, + _gf_false); if (ret) goto out; 
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 0a3d6482ca3..b33db59b50f 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -143,7 +143,8 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, int afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, - uuid_t gfid, struct afr_reply *replies); + uuid_t gfid, struct afr_reply *replies, + gf_boolean_t checksum); inode_t * afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, @@ -165,6 +166,10 @@ int afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, afr_transaction_type type, int *dirty, int **matrix); +void +afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *targets, char *new_status); + int afr_sh_generic_fop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *pre, diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 26f4a80777f..9c4f3ec81f6 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -315,8 +315,8 @@ afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent, return ret; } -int -afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) +static int +_afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) { int ret = 0; eh_t *eh = NULL; @@ -377,6 +377,44 @@ out: } +int +afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) +{ + afr_private_t *priv = healer->this->private; + int ret = _afr_shd_selfheal (healer, child, gfid); + + /* + * You are not expected to understand this code. OK, sorry, it's a + * very old UNIX meme. I've been waiting years for an appropriate time + * to use it, and this seems as good as it's going to get. 
If it makes + * you feel any better, the reason I don't expect you to understand + * this code is that I don't understand it either and therefore can't + * explain it. + * + * What's going on here is that we only call afr_shd_zero_xattrop for a + * return value of two, which non-obviously means that no heal was + * deemed necessary. However, we made it seem necessary *only* because + * of data validation, so we skipped the part where we'd return that + * value normally. It was only later, and several layers deeper in the + * call hierarchy, that we realized everything was OK after all. + * Expecting to return a two at that point, and have it survive all the + * intervening layers, and not have any other untoward side effects, + * would require more optimism about this code than I've ever felt. + * Changing it here isn't entirely without risk either, but at least + * the side effects this way are easier to reason about. + * + * You might well wonder how the index entry ever gets removed in the + * other cases. I wonder too. Observation says that it does, and + * that's good enough. It's a big world, with many other mysteries in + * it. 
+ */ + if (priv->shd_validate_data && (ret >= 0)) { + ret = 2; + } + + return ret; +} + void afr_shd_sweep_prepare (struct subvol_healer *healer) { diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 86f667116af..f291626fff9 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -167,6 +167,9 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("pgfid-self-heal", priv->pgfid_self_heal, options, bool, out); + GF_OPTION_RECONF ("shd-validate-data", priv->shd_validate_data, + options, bool, out); + GF_OPTION_RECONF ("data-self-heal-window-size", priv->data_self_heal_window_size, options, uint32, out); @@ -426,6 +429,8 @@ init (xlator_t *this) GF_OPTION_INIT ("pgfid-self-heal", priv->pgfid_self_heal, bool, out); + GF_OPTION_INIT ("shd-validate-data", priv->shd_validate_data, bool, out); + GF_OPTION_INIT ("background-self-heal-count", priv->background_self_heal_count, uint32, out); @@ -1112,5 +1117,9 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "off", }, + { .key = {"shd-validate-data"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index b60822d0ca9..3314f865781 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -101,6 +101,7 @@ typedef struct _afr_private { unsigned char *child_up; int64_t *child_latency; gf_boolean_t pgfid_self_heal; + gf_boolean_t shd_validate_data;; unsigned char *local; char **pending_key; @@ -1101,7 +1102,7 @@ int afr_final_errno (afr_local_t *local, afr_private_t *priv); int -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req); +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum); void afr_fix_open (fd_t *fd, xlator_t *this); diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 
d8c81900b6f..985490892f3 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1320,6 +1320,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { .description = "Use PGFID attribute if available to remediate " "failed heals." }, + { .key = "cluster.shd-validate-data", + .voltype = "cluster/replicate", + .op_version = 2, + .flags = OPT_FLAG_CLIENT_OPT, + .description = "Check data as well as flags for self-heal." + }, /* stripe xlator options */ { .key = "cluster.stripe-block-size", diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c index da0bca6e41c..fb112d8172d 100644 --- a/xlators/storage/posix/src/posix-handle.c +++ b/xlators/storage/posix/src/posix-handle.c @@ -26,6 +26,8 @@ #include "compat-errno.h" +#define GF_ORPHAN_PATH "orphans" + inode_t * posix_resolve (xlator_t *this, inode_table_t *itable, inode_t *parent, char *bname, struct iatt *iabuf) @@ -515,7 +517,7 @@ posix_handle_init (xlator_t *this) } handle_pfx = alloca (priv->base_path_length + 1 + strlen (GF_HIDDEN_PATH) - + 1); + + 1 + strlen (GF_ORPHAN_PATH) + 1); sprintf (handle_pfx, "%s/%s", priv->base_path, GF_HIDDEN_PATH); @@ -599,6 +601,16 @@ posix_handle_init (xlator_t *this) break; } + sprintf (handle_pfx, "%s/%s/%s", priv->base_path, GF_HIDDEN_PATH, + GF_ORPHAN_PATH); + if (mkdir (handle_pfx, 0700) < 0) { + if (errno != EEXIST) { + gf_log (this->name, GF_LOG_WARNING, + "failed to create orphans directory (%s)", + strerror (errno)); + } + } + return 0; } diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 6bd6bb1a0bf..5c7cca9fbac 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -18,6 +18,7 @@ #include <ftw.h> #include <sys/stat.h> #include <signal.h> +#include <openssl/md5.h> #ifdef HAVE_SYS_ACL_H #ifdef HAVE_ACL_LIBACL_H /* for acl_to_any_text() */ @@ -239,7 +240,8 @@ 
_posix_xattr_get_set_from_backend (posix_xattr_filler_t *filler, char *key)
                 }
 
                 value[xattr_size] = '\0';
-                ret = dict_set_bin (filler->xattr, key, value, xattr_size);
+                ret = dict_set_bin (filler->xattr, key,
+                                    value, xattr_size);
                 if (ret < 0) {
                         if (filler->real_path)
                                 gf_msg_debug (filler->this->name, 0,
@@ -798,6 +800,59 @@ out:
         return;
 }
 
+/*
+ * Compute a 32-bit checksum for the file at 'path' and store it in
+ * '*result'.  The value is the first four bytes of the MD5 digest of the
+ * file's contents, assembled in a fixed (little-endian) byte order so the
+ * result does not depend on the byte order of the host computing it.
+ *
+ * Returns 0 on success, -1 if the file can't be opened, -2 on a read error.
+ */
+static int32_t
+_handle_cksum (const char *path, uint32_t *result)
+{
+        int             fd;
+        ssize_t         bytes;
+        unsigned char   buf[65536];
+        unsigned char   cksum[MD5_DIGEST_LENGTH] = {0, };
+        MD5_CTX         mdctx;
+
+        fd = open (path, O_RDONLY);
+        if (fd < 0) {
+                return -1;
+        }
+
+        MD5_Init (&mdctx);
+
+        for (;;) {
+                /* read() returns -1 on error, so 'bytes' must be signed. */
+                bytes = read (fd, buf, sizeof (buf));
+                if (bytes < 0) {
+                        close (fd);
+                        return -2;
+                }
+                if (bytes == 0) {
+                        break;
+                }
+                /*
+                 * A short read is not proof of EOF; keep going until read()
+                 * returns zero so we never checksum just a prefix.
+                 */
+                MD5_Update (&mdctx, buf, bytes);
+        }
+
+        MD5_Final (cksum, &mdctx);
+        close (fd);
+
+        /* Build the value bytewise; casting cksum to (uint32_t *) would be
+         * an aliasing violation and a potentially misaligned load. */
+        *result = (uint32_t) cksum[0]
+                | ((uint32_t) cksum[1] << 8)
+                | ((uint32_t) cksum[2] << 16)
+                | ((uint32_t) cksum[3] << 24);
+        return 0;
+}
+
 dict_t *
 posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
                   int fdnum, dict_t *xattr_req, struct iatt *buf)
@@ -805,12 +860,19 @@ posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
         dict_t *xattr = NULL;
         posix_xattr_filler_t filler = {0, };
         gf_boolean_t list = _gf_false;
+        gf_boolean_t do_cksum = _gf_false;
+        uint32_t cksum_val;
 
         if (dict_get (xattr_req, "list-xattr")) {
                 dict_del (xattr_req, "list-xattr");
                 list = _gf_true;
         }
 
+        if (dict_get (xattr_req, "get-checksum")) {
+                dict_del (xattr_req, "get-checksum");
+                do_cksum = _gf_true;
+        }
+
         xattr = dict_new ();
         if (!xattr) {
                 goto out;
@@ -828,6 +890,20 @@ posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
         if (list)
                 _handle_list_xattr (xattr_req, real_path, fdnum, &filler);
 
+        if (do_cksum) {
+                if (_handle_cksum (real_path, &cksum_val) == 0) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "generated checksum %x for %s", cksum_val, real_path);
+                        if (dict_set_uint32 (xattr, "checksum", cksum_val) != 0) {
+                                gf_log (this->name,
GF_LOG_ERROR,
+                                        "could not store checksum for %s", real_path);
+                        }
+                } else {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "could not get checksum for %s", real_path);
+                }
+        }
+
 out:
         return xattr;
 }
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index 32b961b680d..b461d4ea4a1 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -3903,6 +3903,205 @@ map_xattr_flags(int flags)
 }
 #endif
 
+/*
+ * Copy the contents of 'src' into 'dst', creating 'dst' if needed and
+ * discarding anything it previously held.
+ *
+ * Returns 0 on success or a positive errno value on failure.  Failures are
+ * logged here, so callers need not report them again.
+ */
+static int
+posix_copy_file (char *src, char *dst)
+{
+        int      ifd = -1;
+        int      ofd = -1;
+        char     buf[65536];
+        ssize_t  ibytes;
+        ssize_t  obytes;
+        ssize_t  written;
+        int      ret;
+
+        ifd = open (src, O_RDONLY);
+        if (ifd < 0) {
+                ret = errno;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "opening source file %s failed (%s)",
+                        src, strerror (ret));
+                goto done;
+        }
+
+        /* O_TRUNC: a shorter copy must not keep the tail of an older file. */
+        ofd = open (dst, O_WRONLY|O_CREAT|O_TRUNC, 0666);
+        if (ofd < 0) {
+                ret = errno;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "opening destination file %s failed (%s)",
+                        dst, strerror (ret));
+                goto done;
+        }
+
+        for (;;) {
+                ibytes = read (ifd, buf, sizeof (buf));
+                if (ibytes < 0) {
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "reading source file %s failed (%s)",
+                                src, strerror (ret));
+                        goto done;
+                }
+                if (ibytes == 0) {
+                        break;
+                }
+                /*
+                 * A short write is not an error and does not set errno, so
+                 * keep writing until the whole buffer has been consumed.
+                 */
+                for (written = 0; written < ibytes; written += obytes) {
+                        obytes = write (ofd, buf + written, ibytes - written);
+                        if (obytes < 0) {
+                                ret = errno;
+                                gf_log (THIS->name, GF_LOG_ERROR,
+                                        "writing destination file %s failed (%s)",
+                                        dst, strerror (ret));
+                                goto done;
+                        }
+                        if (obytes == 0) {
+                                /* No progress; bail out rather than spin. */
+                                ret = EIO;
+                                gf_log (THIS->name, GF_LOG_ERROR,
+                                        "only wrote %zd/%zd bytes from %s to %s",
+                                        written, ibytes, src, dst);
+                                goto done;
+                        }
+                }
+        }
+
+        if (fsync (ofd) < 0) {
+                /* Only warn: a failed fsync does not fail the copy. */
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "fsync failed copying %s (%s)",
+                        dst, strerror (errno));
+        }
+        ret = 0;
+
+done:
+        if (ifd >= 0) {
+                close (ifd);
+        }
+        if (ofd >= 0) {
+                close (ofd);
+        }
+        return ret;
+}
+
+static char orphan_file_pattern[] = "%s/.glusterfs/orphans/%s.%s";
+static char orphan_link_pattern[] = "../..%s";
+
+/*
+ * Preserve a copy of the file behind 'inode' under .glusterfs/orphans.  Two
+ * entries named after the GFID are created: "<gfid>.orig" receives the data
+ * and "<gfid>.link" is a symlink pointing back at the file's path within
+ * the brick.  Failure to create the symlink is not treated as an error.
+ *
+ * Returns 0 on success and a non-zero error value otherwise.
+ */
+static int32_t
+posix_move_aside (call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+        struct posix_private *priv = this->private;
+        char            *rel_path = NULL;
+        size_t           my_str_len = 1;        /* Just the NUL */
+        char            *src_path;
+        char            *dst_path;
+        int32_t          op_errno = 0;
+        struct iatt      stbuf;
+        char            *link_tgt;
+
+        if (inode_path (inode, NULL, &rel_path) <= 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not get move-aside path for %s",
+                        uuid_utoa (inode->gfid));
+                op_errno = ENOENT;
+                goto done;
+        }
+
+        my_str_len = strlen (priv->base_path) + strlen (rel_path) + 1;
+        src_path = alloca (my_str_len);
+        snprintf (src_path, my_str_len, "%s%s", priv->base_path, rel_path);
+
+        op_errno = posix_pstat (this, NULL, src_path, &stbuf);
+        if (op_errno != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside stat failed for %s", src_path);
+                goto done;
+        }
+
+        /*
+         * Room for the pattern (fixed text plus NUL), the brick path, and
+         * 36 for a GFID plus 4 for the orig/link extension.  Omitting the
+         * brick path here would let the formatted name overrun dst_path.
+         */
+        my_str_len = sizeof (orphan_file_pattern)
+                   + strlen (priv->base_path) + 40;
+        dst_path = alloca (my_str_len);
+        snprintf (dst_path, my_str_len, orphan_file_pattern, priv->base_path,
+                  uuid_utoa (inode->gfid), "orig");
+
+        gf_log (this->name, GF_LOG_INFO,
+                "move-aside: src = %s, dst = %s", src_path, dst_path);
+
+#if defined(MOVING_THE_FILE_WORKS)
+        /*
+         * This is how we should really do things, to avoid the overhead of
+         * copying (potentially large) amounts of data.  Unfortunately, if the
+         * file and all of its xattrs aren't there, the self-heal that's the
+         * whole point of our little exercise doesn't work.  The same might be
+         * true of the .glusterfs handle.  Until some magic formula can be
+         * found, our most expedient choice is to *copy* the file instead of
+         * moving it.
+         */
+        if (sys_link (src_path, dst_path) < 0) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside link failed for %s", src_path);
+                goto done;
+        }
+
+        if (posix_unlink_gfid_handle_and_entry (this, src_path, &stbuf,
+                                                &op_errno) < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside unlink failed for %s", src_path);
+                goto done;
+        }
+#else
+        op_errno = posix_copy_file (src_path, dst_path);
+        if (op_errno != 0) {
+                /* Errors would have been reported already. */
+                goto done;
+        }
+#endif
+
+        /* Reuses dst_path: the "link" name is as long as the "orig" one. */
+        snprintf (dst_path, my_str_len, orphan_file_pattern, priv->base_path,
+                  uuid_utoa (inode->gfid), "link");
+        my_str_len = sizeof (orphan_link_pattern) + strlen (rel_path);
+        link_tgt = alloca (my_str_len);
+        snprintf (link_tgt, my_str_len, orphan_link_pattern, rel_path);
+
+        if (sys_symlink (link_tgt, dst_path) < 0) {
+                /* This is deliberately not fatal. */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "move-aside could not link %s to %s",
+                        dst_path, link_tgt);
+        }
+
+done:
+        if (rel_path) {
+                GF_FREE (rel_path);
+        }
+        return op_errno;
+}
+
 int32_t
 posix_setxattr (call_frame_t *frame, xlator_t *this,
                 loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
@@ -3948,6 +4147,17 @@ posix_setxattr (call_frame_t *frame, xlator_t *this,
 #else
         filler.flags = flags;
 #endif
+
+        if (dict_get (dict, "trusted.move-aside")) {
+                dict_del (dict, "trusted.move-aside");
+                op_ret = posix_move_aside (frame, this, loc->inode);
+                if (op_ret != 0) {
+                        op_errno = abs (op_ret);
+                        op_ret = -1;
+                        goto out;
+                }
+        }
+
         op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair,
                                &filler);
         if (op_ret < 0) {