summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJeff Darcy <jdarcy@fb.com>2017-05-18 10:22:12 -0700
committerJeff Darcy <jeff@pl.atyp.us>2017-09-12 21:35:20 +0000
commitcf8a4d17fe9133f050560c4d655255a003185fe5 (patch)
tree25703f1d827cc3f74770d8a51118b660f2769cc5
parent4e880ed12b989b36387ea79179b5f14d8cee9491 (diff)
Remediation for XFS/DIO corruption problem.
This adds a new volume option, shd-validate-data. When set, the self-heal code will fetch checksums for regular files along with all the usual xattrs. If the file seems OK but the checksums show a data mismatch, and if there is only one replica that's out of step with the others, then we modify the source/sink calculations to force a heal from one of the agreeing replicas to the odd one out. Combined with a tool to put files into the self-heal index (being developed separately), this provides a very rudimentary kind of scrubbing functionality. Validation is now conditional on the "trusted.glusterfs.validate-status" xattr having the specific value of "suspect" to avoid redoing validation (which is expensive) as we find the same file in multiple bricks' indices. When we decide to take action, we update this xattr to "clean" for copies that were in the majority and "repaired" for the odd one out that gets clobbered. We also copy the about-to-be-clobbered copy into an "orphans" directory to facilitate analysis of corruption patterns. The data goes into ${GFID}.orig there, while ${GFID}.link is a symlink to the file's old location. Porting note: this is several internal changes squashed together ("See Also") Differential Revision: https://phabricator.intern.facebook.com/D5092983 See Also: https://phabricator.intern.facebook.com/D5126974 See Also: https://phabricator.intern.facebook.com/D5127427 See Also: https://phabricator.intern.facebook.com/D5132804 See Also: https://phabricator.intern.facebook.com/D5209185 See Also: https://phabricator.intern.facebook.com/D5370353 Change-Id: Ie0ae18b368c408a5e47d0bf03ebac80b87b70aa9 Signed-off-by: Jeff Darcy <jdarcy@fb.com> Reviewed-on: https://review.gluster.org/18269 Reviewed-by: Jeff Darcy <jeff@pl.atyp.us> Tested-by: Jeff Darcy <jeff@pl.atyp.us> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Smoke: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r--tests/xfs-dio.t118
-rw-r--r--xlators/cluster/afr/src/afr-common.c22
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c82
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c141
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c3
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h7
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c42
-rw-r--r--xlators/cluster/afr/src/afr.c9
-rw-r--r--xlators/cluster/afr/src/afr.h3
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c6
-rw-r--r--xlators/storage/posix/src/posix-handle.c14
-rw-r--r--xlators/storage/posix/src/posix-helpers.c64
-rw-r--r--xlators/storage/posix/src/posix.c178
15 files changed, 669 insertions, 24 deletions
diff --git a/tests/xfs-dio.t b/tests/xfs-dio.t
new file mode 100644
index 00000000000..c8cabd5766b
--- /dev/null
+++ b/tests/xfs-dio.t
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+. $(dirname $0)/include.rc
+. $(dirname $0)/volume.rc
+
+# Write the remaining arguments, joined by spaces and followed by a newline,
+# to the file named by the first argument (truncating it).
+# Usage: write_data PATH WORD...
+write_data () {
+        # Keep the variable local so it cannot clobber anything in the test
+        # harness, and quote it so the redirect target is never word-split.
+        local path=$1
+        shift
+        echo "$@" > "$path"
+}
+
+# Put file $2 (a path relative to the brick root) into the xattrop index of
+# brick $1 by hard-linking the brick's base xattrop-* file under the file's
+# GFID, then mark that copy "suspect" so data validation will look at it.
+# Returns non-zero if the index entry could not be created or the xattr could
+# not be set, so that the surrounding TEST notices a broken setup.
+create_index_entry () {
+        local brick=$1
+        local file=$2
+        local index_dir=$brick/.glusterfs/indices/xattrop
+        local gfid_str=$(gf_get_gfid_xattr "$brick/$file")
+        local gfid_path=$(gf_gfid_xattr_to_str "$gfid_str")
+        local xop_file=$(ls "$index_dir"/xattrop-* | tail -n1)
+
+        # Without a base file there is nothing to link to.
+        [ -n "$xop_file" ] || return 1
+
+        # An entry that is already present is fine; a failed link is not.
+        [ -e "$index_dir/$gfid_path" ] || \
+                ln "$xop_file" "$index_dir/$gfid_path" || return 1
+
+        setfattr -n trusted.glusterfs.validate-status -v suspect "$brick/$file"
+}
+
+# Print the raw value of the validate-status xattr on the file named by $1.
+# Anything getfattr writes to stderr is discarded.
+get_vstatus () {
+        getfattr --only-values --name trusted.glusterfs.validate-status $1 \
+                2> /dev/null
+}
+
+trap cleanup EXIT
+
+TEST glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+
+# Comment out the following line to see the "argh" and "blah" tests at the end
+# fail. That's because normal self-heal can't deal with this particular
+# condition. To do that, we must check the actual data (OK, checksums). That's
+# expensive, but if there's corruption below us - e.g. filesystem bug, flaky
+# disk - then it's what we have to do.
+TEST $CLI volume set $V0 cluster.shd-validate-data on
+
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+TEST $GFS -s $H0 --volfile-id=$V0 $M0
+TEST mkdir $M0/xyz
+TEST write_data $M0/file-ok hello
+TEST write_data $M0/file-use0 hello
+TEST write_data $M0/xyz/file-use1 hello
+TEST write_data $M0/file-bad hello
+TEST umount $M0
+
+# Corrupt a bunch of data.
+TEST write_data $B0/${V0}2/file-use0 'argh!'
+TEST write_data $B0/${V0}0/xyz/file-use1 'blah!'
+TEST write_data $B0/${V0}1/file-bad 'diffX'
+TEST write_data $B0/${V0}2/file-bad 'diffY'
+
+# Add the files to their indices.
+TEST create_index_entry $B0/${V0}0 file-ok
+TEST create_index_entry $B0/${V0}1 file-ok
+TEST create_index_entry $B0/${V0}2 file-ok
+TEST create_index_entry $B0/${V0}0 file-use0
+TEST create_index_entry $B0/${V0}1 file-use0
+TEST create_index_entry $B0/${V0}2 file-use0
+TEST create_index_entry $B0/${V0}0 xyz/file-use1
+TEST create_index_entry $B0/${V0}1 xyz/file-use1
+TEST create_index_entry $B0/${V0}2 xyz/file-use1
+TEST create_index_entry $B0/${V0}0 file-bad
+TEST create_index_entry $B0/${V0}1 file-bad
+TEST create_index_entry $B0/${V0}2 file-bad
+
+# Time to see what we can do.
+TEST $CLI volume heal $V0
+
+# These files are not marked in the normal way as needing heal (that's kind of
+# the whole problem) so heal counts aren't useful. There are only a few tiny
+# files, so just wait a few seconds for the heal to complete.
+sleep 5
+
+# Test the contents of the files.
+EXPECT hello cat $B0/${V0}0/file-ok
+EXPECT hello cat $B0/${V0}1/file-ok
+EXPECT hello cat $B0/${V0}2/file-ok
+EXPECT hello cat $B0/${V0}0/file-use0
+EXPECT hello cat $B0/${V0}1/file-use0
+EXPECT hello cat $B0/${V0}2/file-use0
+EXPECT hello cat $B0/${V0}0/xyz/file-use1
+EXPECT hello cat $B0/${V0}1/xyz/file-use1
+EXPECT hello cat $B0/${V0}2/xyz/file-use1
+# This was in three-way split brain, so the replicas should still diverge.
+EXPECT hello cat $B0/${V0}0/file-bad
+EXPECT diffX cat $B0/${V0}1/file-bad
+EXPECT diffY cat $B0/${V0}2/file-bad
+
+# Now test validation states.
+EXPECT clean get_vstatus $B0/${V0}0/file-ok
+EXPECT clean get_vstatus $B0/${V0}1/file-ok
+EXPECT clean get_vstatus $B0/${V0}2/file-ok
+EXPECT clean get_vstatus $B0/${V0}0/file-use0
+EXPECT clean get_vstatus $B0/${V0}1/file-use0
+EXPECT repaired get_vstatus $B0/${V0}2/file-use0
+EXPECT repaired get_vstatus $B0/${V0}0/xyz/file-use1
+EXPECT clean get_vstatus $B0/${V0}1/xyz/file-use1
+EXPECT clean get_vstatus $B0/${V0}2/xyz/file-use1
+EXPECT suspect get_vstatus $B0/${V0}0/file-bad
+EXPECT suspect get_vstatus $B0/${V0}1/file-bad
+EXPECT suspect get_vstatus $B0/${V0}2/file-bad
+
+# Debugging aid (its only call, at the end of the script, is commented out):
+# dump the contents and validate-status xattr of every copy of each test file
+# found under $B0, followed by whatever was saved as *.orig / *.link.
+print_summary () {
+        local f
+
+        # "-exec ... {} +" batches file names like xargs does, but runs
+        # nothing at all when find matches no files. With "find | xargs",
+        # an empty result would make grep read stdin and ls list the
+        # current directory instead.
+        for f in file-ok file-bad file-use0 file-use1; do
+                echo "=== FILE $f"
+                find "$B0/" -name "$f" -exec grep -E . {} +
+                find "$B0/" -name "$f" -exec getfattr -d -e text \
+                        -m trusted.glusterfs.validate-status {} +
+        done
+        echo "=== ORPHANS"
+        find "$B0" -name '*.orig' -exec grep -E . {} +
+        find "$B0" -name '*.link' -exec ls -l {} +
+}
+
+#print_summary
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 7425b6688f2..f5210bd7d5d 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1267,7 +1267,7 @@ afr_inode_refresh_do (call_frame_t *frame, xlator_t *this)
return 0;
}
- ret = afr_xattr_req_prepare (this, xdata);
+ ret = afr_xattr_req_prepare (this, xdata, _gf_false);
if (ret != 0) {
dict_unref (xdata);
afr_inode_refresh_done (frame, this, -ret);
@@ -1360,7 +1360,7 @@ afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
int
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum)
{
int i = 0;
afr_private_t *priv = NULL;
@@ -1397,6 +1397,19 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
"Unable to set ancestry path key in dict ");
}
+ if (checksum) {
+ ret = dict_set_int32 (xattr_req, "get-checksum", 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to set get-checksum in dict ");
+ }
+ ret = dict_set_int32 (xattr_req, "trusted.glusterfs.validate-status", 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to set validate-status in dict ");
+ }
+ }
+
return ret;
}
@@ -1415,7 +1428,7 @@ afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
if (xattr_req && (xattr_req != local->xattr_req))
dict_copy (xattr_req, local->xattr_req);
- ret = afr_xattr_req_prepare (this, local->xattr_req);
+ ret = afr_xattr_req_prepare (this, local->xattr_req, _gf_false);
ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
if (ret < 0) {
@@ -5906,7 +5919,8 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
replies = alloca0 (sizeof (*replies) * priv->child_count);
- ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies,
+ priv->shd_validate_data);
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 800cf9705c9..5a9ab795a94 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -298,6 +298,51 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
return 0;
}
+/*
+ * Completion callback for the setxattr issued by afr_selfheal_update_vstatus.
+ *
+ * It only wakes the barrier stored in the frame's afr_local_t; op_ret and
+ * op_errno are not examined, so a failed update is not reported from here.
+ * Always returns 0.
+ */
+static int
+afr_selfheal_vstatus_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, dict_t *xdata)
+{
+        afr_local_t *local = NULL;
+
+        local = frame->local;
+        syncbarrier_wake (&local->barrier);
+
+        return 0;
+}
+
+/*
+ * Set the "trusted.glusterfs.validate-status" xattr to new_status on the
+ * children selected by targets[], for the given inode.
+ *
+ * The setxattr is sent through AFR_ONLIST with afr_selfheal_vstatus_cbk as
+ * the callback. Nothing is returned: if the request dict cannot be built, a
+ * warning is logged and no request is sent.
+ */
+void
+afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                             unsigned char *targets, char *new_status)
+{
+        loc_t   loc     = {0, };
+        dict_t *request = NULL;
+
+        loc.inode = inode_ref (inode);
+        request = dict_new ();
+
+        if (request == NULL) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unable to allocate validate-status for %s",
+                        uuid_utoa (inode->gfid));
+        } else if (dict_set_str (request, "trusted.glusterfs.validate-status",
+                                 new_status) != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "couldn't clear validate-status for %s",
+                        uuid_utoa (inode->gfid));
+        } else {
+                AFR_ONLIST (targets, frame, afr_selfheal_vstatus_cbk, setxattr,
+                            &loc, request, 0, NULL);
+        }
+
+        /* Common cleanup for every path above. */
+        if (request != NULL) {
+                dict_unref (request);
+        }
+        loc_wipe (&loc);
+}
+
void
afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count)
@@ -1379,7 +1424,7 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
if (xattr)
dict_copy (xattr, xattr_req);
- if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ if (afr_xattr_req_prepare (frame->this, xattr_req, _gf_false) != 0) {
dict_destroy (xattr_req);
return NULL;
}
@@ -1406,10 +1451,11 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
return inode;
}
-int
+static int
afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies,
- unsigned char *discover_on)
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on,
+ gf_boolean_t checksum)
{
loc_t loc = {0, };
dict_t *xattr_req = NULL;
@@ -1423,7 +1469,7 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
if (!xattr_req)
return -ENOMEM;
- if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ if (afr_xattr_req_prepare (frame->this, xattr_req, checksum) != 0) {
dict_destroy (xattr_req);
return -ENOMEM;
}
@@ -1444,14 +1490,15 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
int
afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies)
+ uuid_t gfid, struct afr_reply *replies,
+ gf_boolean_t checksum)
{
afr_private_t *priv = NULL;
priv = frame->this->private;
return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies,
- priv->child_up);
+ priv->child_up, checksum);
}
unsigned int
@@ -1865,7 +1912,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
replies = alloca0 (sizeof (*replies) * priv->child_count);
- ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies, _gf_false);
if (ret)
goto out;
@@ -1983,6 +2030,22 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
if (data_selfheal)
*data_selfheal = _gf_true;
}
+
+ if (priv->shd_validate_data && data_selfheal && !*data_selfheal) {
+ if (IA_ISREG (replies[i].poststat.ia_type)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "forcing data self-heal on %s",
+ uuid_utoa (replies[i].poststat.ia_gfid));
+ /*
+ * This will force our caller (e.g.
+ * afr_selfheal_do) to call afr_selfheal_data,
+ * even though it might otherwise think
+ * everything looks OK. From there, we'll do a
+ * more thorough inspection including checksums.
+ */
+ *data_selfheal = _gf_true;
+ }
+ }
}
if (valid_cnt > 0 && link_inode) {
@@ -1999,7 +2062,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
ret = 0;
out:
if (inode)
- inode_unref (inode);
+ inode_unref (inode);
if (replies)
afr_replies_wipe (replies, priv->child_count);
@@ -2140,7 +2203,6 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
&data_selfheal,
&metadata_selfheal,
&entry_selfheal);
-
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index c1e945bfd82..894c8e68f25 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -587,6 +587,135 @@ out:
return source;
}
+/*
+ * Ask child i to preserve its current copy of the file before it gets
+ * overwritten by a heal.
+ *
+ * This is done by sending a synchronous setxattr carrying the key
+ * "trusted.move-aside" to priv->children[i]; posix_setxattr intercepts that
+ * key. The frame argument is not used.
+ *
+ * Returns 0 if the setxattr succeeded, -1 otherwise (an error is logged).
+ */
+static int
+afr_move_aside (call_frame_t *frame, xlator_t *this, inode_t *inode, int i)
+{
+        afr_private_t *priv    = this->private;
+        loc_t          loc     = {0, };
+        dict_t        *request = NULL;
+        int            ret     = -1;
+
+        loc.inode = inode_ref (inode);
+
+        request = dict_new ();
+        if (request == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to alloc move-aside dict for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else if (dict_set_str (request, "trusted.move-aside",
+                                 "please") != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set move-aside xattr for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else if (syncop_setxattr (priv->children[i], &loc, request, 0,
+                                    NULL, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to send move-aside fop for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else {
+                ret = 0;
+        }
+
+        if (request != NULL) {
+                dict_unref (request);
+        }
+        loc_wipe (&loc);
+
+        return ret;
+}
+
+/*
+ * Turn every child whose checksum differs from "good" into a sink, after
+ * asking it to keep a copy of the data that is about to be overwritten.
+ */
+static void
+afr_validation_mark_sinks (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                           unsigned char *sources, unsigned char *sinks,
+                           uint32_t *values, uint32_t good)
+{
+        afr_private_t *priv = this->private;
+        int            i;
+
+        for (i = 0; i < priv->child_count; ++i) {
+                if (values[i] == good) {
+                        continue;
+                }
+                sources[i] = 0;
+                sinks[i] = 1;
+                /*
+                 * Best effort: afr_move_aside logs its own failures, and a
+                 * failure to save the old copy does not stop the heal.
+                 */
+                (void) afr_move_aside (frame, this, inode, i);
+        }
+}
+
+/*
+ * Adjust sources[]/sinks[] according to the per-child data checksums carried
+ * in replies[].xdata. Does nothing unless shd_validate_data is enabled.
+ *
+ * Validation is skipped (arrays untouched) as soon as any child has no xdata,
+ * has no "trusted.glusterfs.validate-status" value, has a value that does not
+ * start with "suspect", or has no "checksum" value.
+ *
+ * Otherwise:
+ *  - all checksums equal: the children in sources[] are marked "clean";
+ *  - exactly one child disagrees with all the others (needs at least three
+ *    children): that child is asked to move its copy aside and is turned
+ *    into a sink;
+ *  - anything else: every child is made a sink and none a source.
+ */
+static void
+afr_handle_validation (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       unsigned char *sources, unsigned char *sinks,
+                       struct afr_reply *replies)
+{
+        afr_private_t   *priv = this->private;
+        uint32_t        *values;
+        int             i;
+        int             count;
+        int             same_as[2] = {0, 0};
+        char            *vstatus;
+
+        if (!priv->shd_validate_data) {
+                return;
+        }
+
+        count = priv->child_count;
+
+        values = alloca0 (sizeof (*values) * count);
+        for (i = 0; i < count; ++i) {
+                if (!replies[i].xdata) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no xdata for child %d", i);
+                        return;
+                }
+                if (dict_get_str (replies[i].xdata,
+                                  "trusted.glusterfs.validate-status",
+                                  &vstatus) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no validate-status for child %d", i);
+                        return;
+                }
+                if (strncmp (vstatus, "suspect", 7) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "validate-status = %s for child %d", vstatus, i);
+                        return;
+                }
+                if (dict_get_uint32 (replies[i].xdata, "checksum", &values[i]) != 0) {
+                        return;
+                }
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "checksum for child %d is 0x%x", i, values[i]);
+        }
+
+        /*
+         * Let's take a shortcut here by looking only for a single odd
+         * man out instead of a more generalized minority. To do this,
+         * we only need to compare every item to (at most) the first two,
+         * and we only need two counters.
+         */
+        for (i = 0; i < count; ++i) {
+                same_as[0] += (values[i] == values[0]);
+                /* values[1] only exists when there is a second child. */
+                if (count > 1) {
+                        same_as[1] += (values[i] == values[1]);
+                }
+        }
+
+        if (same_as[0] == count) {
+                gf_log (this->name, GF_LOG_DEBUG, "everything's OK");
+                afr_selfheal_update_vstatus (frame, this, inode,
+                                             sources, "clean");
+        } else if ((count >= 3) && (same_as[0] == (count - 1))) {
+                /*
+                 * "One odd man out" is only meaningful with three or more
+                 * children. With two, a mismatch leaves one against one and
+                 * neither copy can be trusted over the other.
+                 */
+                gf_log (this->name, GF_LOG_DEBUG, "odd man out, use 0");
+                afr_validation_mark_sinks (frame, this, inode, sources, sinks,
+                                           values, values[0]);
+        } else if ((count >= 3) && (same_as[1] == (count - 1))) {
+                gf_log (this->name, GF_LOG_DEBUG, "odd man out, use 1");
+                afr_validation_mark_sinks (frame, this, inode, sources, sinks,
+                                           values, values[1]);
+        } else {
+                /* No usable majority: leave no child marked as a source. */
+                gf_log (this->name, GF_LOG_WARNING, "three-way split on %s",
+                        uuid_utoa (inode->gfid));
+                for (i = 0; i < count; ++i) {
+                        sources[i] = 0;
+                        sinks[i] = 1;
+                }
+        }
+}
+
/*
* __afr_selfheal_data_prepare:
*
@@ -612,7 +741,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, priv->shd_validate_data);
if (ret)
return ret;
@@ -625,6 +754,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
if (ret)
return ret;
+ afr_handle_validation (frame, this, inode, sources, sinks, replies);
+
/* Initialize the healed_sinks[] array optimistically to
the intersection of to-be-healed (i.e sinks[]) and
the list of servers which are up (i.e locked_on[]).
@@ -749,6 +880,14 @@ restore_time:
sources, sinks, healed_sinks,
undid_pending, AFR_DATA_TRANSACTION,
locked_replies, data_lock);
+
+ if (priv->shd_validate_data) {
+ afr_selfheal_update_vstatus (frame, this, fd->inode,
+ healed_sinks, "repaired");
+ afr_selfheal_update_vstatus (frame, this, fd->inode,
+ sources, "clean");
+ }
+
skip_undo_pending:
afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
data_lock);
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 25f8ea313aa..e0a82426a33 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -489,7 +489,7 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this,
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, _gf_false);
if (ret)
return ret;
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 4570ace7ef7..85dbdf2976e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -336,7 +336,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, _gf_false);
if (ret)
return ret;
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index 8e5546a702f..db78ef81804 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -796,7 +796,8 @@ __afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *paren
replies = alloca0 (priv->child_count * sizeof(*replies));
- ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies,
+ _gf_false);
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 0a3d6482ca3..b33db59b50f 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -143,7 +143,8 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
int
afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies);
+ uuid_t gfid, struct afr_reply *replies,
+ gf_boolean_t checksum);
inode_t *
afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
@@ -165,6 +166,10 @@ int
afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
afr_transaction_type type, int *dirty, int **matrix);
+void
+afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *targets, char *new_status);
+
int
afr_sh_generic_fop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *pre,
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 26f4a80777f..9c4f3ec81f6 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -315,8 +315,8 @@ afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent,
return ret;
}
-int
-afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
+static int
+_afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
{
int ret = 0;
eh_t *eh = NULL;
@@ -377,6 +377,44 @@ out:
}
+/*
+ * Public wrapper around _afr_shd_selfheal.
+ *
+ * When shd-validate-data is off, the inner result is returned untouched.
+ * When it is on, every non-negative result is replaced by 2; negative
+ * results are passed through.
+ *
+ * Why 2: afr_shd_zero_xattrop is only called for a return value of two,
+ * which non-obviously means that no heal was deemed necessary. With data
+ * validation enabled, a heal is made to look necessary purely so that the
+ * checksums get inspected, which skips the place where 2 would normally be
+ * returned. Only several layers deeper do we find out that everything was
+ * fine after all, and getting a 2 back up through all the intervening
+ * layers without side effects is not something this code can be trusted to
+ * do. Forcing the value here is not risk-free either, but the side effects
+ * are easier to reason about.
+ *
+ * How the index entry gets removed in the other cases is not understood;
+ * observation says that it does.
+ */
+int
+afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
+{
+        afr_private_t   *priv = healer->this->private;
+        int             ret;
+
+        ret = _afr_shd_selfheal (healer, child, gfid);
+
+        if ((ret < 0) || !priv->shd_validate_data) {
+                return ret;
+        }
+
+        return 2;
+}
+
void
afr_shd_sweep_prepare (struct subvol_healer *healer)
{
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 86f667116af..f291626fff9 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -167,6 +167,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("pgfid-self-heal", priv->pgfid_self_heal,
options, bool, out);
+ GF_OPTION_RECONF ("shd-validate-data", priv->shd_validate_data,
+ options, bool, out);
+
GF_OPTION_RECONF ("data-self-heal-window-size",
priv->data_self_heal_window_size, options,
uint32, out);
@@ -426,6 +429,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("pgfid-self-heal", priv->pgfid_self_heal, bool, out);
+ GF_OPTION_INIT ("shd-validate-data", priv->shd_validate_data, bool, out);
+
GF_OPTION_INIT ("background-self-heal-count",
priv->background_self_heal_count, uint32, out);
@@ -1112,5 +1117,9 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
},
+ { .key = {"shd-validate-data"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index b60822d0ca9..3314f865781 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -101,6 +101,7 @@ typedef struct _afr_private {
unsigned char *child_up;
int64_t *child_latency;
gf_boolean_t pgfid_self_heal;
+ gf_boolean_t shd_validate_data;;
unsigned char *local;
char **pending_key;
@@ -1101,7 +1102,7 @@ int
afr_final_errno (afr_local_t *local, afr_private_t *priv);
int
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum);
void
afr_fix_open (fd_t *fd, xlator_t *this);
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index d8c81900b6f..985490892f3 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1320,6 +1320,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.description = "Use PGFID attribute if available to remediate "
"failed heals."
},
+ { .key = "cluster.shd-validate-data",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT,
+ .description = "Check data as well as flags for self-heal."
+ },
/* stripe xlator options */
{ .key = "cluster.stripe-block-size",
diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c
index da0bca6e41c..fb112d8172d 100644
--- a/xlators/storage/posix/src/posix-handle.c
+++ b/xlators/storage/posix/src/posix-handle.c
@@ -26,6 +26,8 @@
#include "compat-errno.h"
+#define GF_ORPHAN_PATH "orphans"
+
inode_t *
posix_resolve (xlator_t *this, inode_table_t *itable, inode_t *parent,
char *bname, struct iatt *iabuf)
@@ -515,7 +517,7 @@ posix_handle_init (xlator_t *this)
}
handle_pfx = alloca (priv->base_path_length + 1 + strlen (GF_HIDDEN_PATH)
- + 1);
+ + 1 + strlen (GF_ORPHAN_PATH) + 1);
sprintf (handle_pfx, "%s/%s", priv->base_path, GF_HIDDEN_PATH);
@@ -599,6 +601,16 @@ posix_handle_init (xlator_t *this)
break;
}
+ sprintf (handle_pfx, "%s/%s/%s", priv->base_path, GF_HIDDEN_PATH,
+ GF_ORPHAN_PATH);
+ if (mkdir (handle_pfx, 0700) < 0) {
+ if (errno != EEXIST) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to create orphans directory (%s)",
+ strerror (errno));
+ }
+ }
+
return 0;
}
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index 6bd6bb1a0bf..5c7cca9fbac 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -18,6 +18,7 @@
#include <ftw.h>
#include <sys/stat.h>
#include <signal.h>
+#include <openssl/md5.h>
#ifdef HAVE_SYS_ACL_H
#ifdef HAVE_ACL_LIBACL_H /* for acl_to_any_text() */
@@ -239,7 +240,8 @@ _posix_xattr_get_set_from_backend (posix_xattr_filler_t *filler, char *key)
}
value[xattr_size] = '\0';
- ret = dict_set_bin (filler->xattr, key, value, xattr_size);
+ ret = dict_set_bin (filler->xattr, key,
+ value, xattr_size);
if (ret < 0) {
if (filler->real_path)
gf_msg_debug (filler->this->name, 0,
@@ -798,6 +800,45 @@ out:
return;
}
+/*
+ * Compute a 32-bit checksum of the file at path.
+ *
+ * The whole file is read and fed to MD5; *result receives the first four
+ * bytes of the digest, interpreted in host byte order.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened, -2 if a read fails.
+ * *result is only written on success.
+ */
+static int32_t
+_handle_cksum (const char *path, uint32_t *result)
+{
+        int             fd;
+        ssize_t         bytes;  /* signed, so a failed read() is detectable */
+        unsigned char   buf[65536];
+        /*
+         * The union lets us read the leading digest bytes as a uint32_t
+         * without casting an unsigned char buffer to uint32_t *, which is
+         * neither alignment- nor aliasing-safe.
+         */
+        union {
+                unsigned char   bytes[MD5_DIGEST_LENGTH];
+                uint32_t        word;
+        } digest = { {0, } };
+        MD5_CTX         mdctx;
+
+        fd = open (path, O_RDONLY);
+        if (fd < 0) {
+                return -1;
+        }
+
+        MD5_Init (&mdctx);
+
+        /*
+         * Keep reading until read() reports end-of-file (0). A short read
+         * is not treated as EOF, so data after one is still checksummed.
+         */
+        for (;;) {
+                bytes = read (fd, buf, sizeof (buf));
+                if (bytes < 0) {
+                        close (fd);
+                        return -2;
+                }
+                if (bytes == 0) {
+                        break;
+                }
+                MD5_Update (&mdctx, buf, (size_t) bytes);
+        }
+
+        MD5_Final (digest.bytes, &mdctx);
+
+        close (fd);
+        *result = digest.word;
+        return 0;
+}
+
dict_t *
posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
int fdnum, dict_t *xattr_req, struct iatt *buf)
@@ -805,12 +846,19 @@ posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
dict_t *xattr = NULL;
posix_xattr_filler_t filler = {0, };
gf_boolean_t list = _gf_false;
+ gf_boolean_t do_cksum = _gf_false;
+ uint32_t cksum_val;
if (dict_get (xattr_req, "list-xattr")) {
dict_del (xattr_req, "list-xattr");
list = _gf_true;
}
+ if (dict_get (xattr_req, "get-checksum")) {
+ dict_del (xattr_req, "get-checksum");
+ do_cksum = _gf_true;
+ }
+
xattr = dict_new ();
if (!xattr) {
goto out;
@@ -828,6 +876,20 @@ posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
if (list)
_handle_list_xattr (xattr_req, real_path, fdnum, &filler);
+ if (do_cksum) {
+ if (_handle_cksum (real_path, &cksum_val) == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "generated checksum %x for %s", cksum_val, real_path);
+ if (dict_set_uint32 (xattr, "checksum", cksum_val) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not store checksum for %s", real_path);
+ }
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not get checksum for %s", real_path);
+ }
+ }
+
out:
return xattr;
}
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index 32b961b680d..b461d4ea4a1 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -3903,6 +3903,173 @@ map_xattr_flags(int flags)
}
#endif
+/*
+ * Copy the contents of file src into file dst.
+ *
+ * dst is created if necessary (mode 0666, subject to umask) and truncated
+ * first, so a previous, longer copy cannot leave stale bytes after the new
+ * data. Only file data is copied.
+ *
+ * Returns 0 on success or a positive errno value on failure; failures are
+ * logged here. A failed fsync is logged as a warning but does not fail the
+ * copy.
+ */
+static int
+posix_copy_file (char *src, char *dst)
+{
+        int     ifd = -1;
+        int     ofd = -1;
+        char    buf[65536];
+        ssize_t ibytes;
+        ssize_t obytes;
+        ssize_t written;
+        int     ret = 0;
+
+        ifd = open (src, O_RDONLY);
+        if (ifd < 0) {
+                ret = errno;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "opening source file %s failed (%s)",
+                        src, strerror (ret));
+                goto done;
+        }
+
+        ofd = open (dst, O_WRONLY|O_CREAT|O_TRUNC, 0666);
+        if (ofd < 0) {
+                ret = errno;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "opening destination file %s failed (%s)",
+                        dst, strerror (ret));
+                goto done;
+        }
+
+        for (;;) {
+                ibytes = read (ifd, buf, sizeof (buf));
+                if (ibytes < 0) {
+                        if (errno == EINTR) {
+                                continue;
+                        }
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "reading source file %s failed (%s)",
+                                src, strerror (ret));
+                        goto done;
+                }
+                if (ibytes == 0) {
+                        break;
+                }
+
+                /*
+                 * write() may legitimately accept fewer bytes than asked
+                 * for, so keep going until the whole chunk is out.
+                 */
+                written = 0;
+                while (written < ibytes) {
+                        obytes = write (ofd, buf + written, ibytes - written);
+                        if (obytes < 0) {
+                                if (errno == EINTR) {
+                                        continue;
+                                }
+                                ret = errno;
+                                gf_log (THIS->name, GF_LOG_ERROR,
+                                        "writing destination file %s failed (%s)",
+                                        dst, strerror (ret));
+                                goto done;
+                        }
+                        if (obytes == 0) {
+                                /*
+                                 * No progress and no errno to report: fail
+                                 * explicitly instead of returning whatever
+                                 * errno happens to hold (possibly 0).
+                                 */
+                                ret = EIO;
+                                gf_log (THIS->name, GF_LOG_ERROR,
+                                        "only wrote %zd/%zd bytes from %s to %s",
+                                        written, ibytes, src, dst);
+                                goto done;
+                        }
+                        written += obytes;
+                }
+        }
+
+        if (fsync (ofd) < 0) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "fsync failed copying %s (%s)",
+                        dst, strerror (errno));
+        }
+        ret = 0;
+
+done:
+        if (ifd >= 0) {
+                close (ifd);
+        }
+        if (ofd >= 0) {
+                close (ofd);
+        }
+        return ret;
+}
+
+/* <brick>/.glusterfs/orphans/<gfid>.<extension> */
+static char orphan_file_pattern[] = "%s/.glusterfs/orphans/%s.%s";
+/* Relative symlink target: from the orphans directory up to the brick root. */
+static char orphan_link_pattern[] = "../..%s";
+
+/*
+ * Preserve the current on-brick copy of inode under .glusterfs/orphans.
+ *
+ * The file's data is copied to <gfid>.orig, and <gfid>.link is created as a
+ * symlink pointing back at the file's path within the brick. Failure to
+ * create the symlink is logged but not treated as an error.
+ *
+ * Returns 0 on success, otherwise the error value from whichever step
+ * failed (ENOENT if no path can be built for the inode).
+ */
+static int32_t
+posix_move_aside (call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+        struct posix_private    *priv = this->private;
+        char                    *rel_path = NULL;
+        size_t                  my_str_len;
+        size_t                  dst_len;
+        char                    *src_path;
+        char                    *dst_path;
+        int32_t                 op_errno = 0;
+        struct iatt             stbuf;
+        char                    *link_tgt;
+
+        if (inode_path (inode, NULL, &rel_path) <= 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not get move-aside path for %s",
+                        uuid_utoa (inode->gfid));
+                op_errno = ENOENT;
+                goto done;
+        }
+
+        my_str_len = strlen (priv->base_path) + strlen (rel_path) + 1;
+        src_path = alloca (my_str_len);
+        snprintf (src_path, my_str_len, "%s%s", priv->base_path, rel_path);
+
+        /*
+         * NOTE(review): the return value of posix_pstat is passed on as the
+         * error code; confirm that it is an errno value rather than -1.
+         */
+        op_errno = posix_pstat (this, NULL, src_path, &stbuf);
+        if (op_errno != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside stat failed for %s", src_path);
+                goto done;
+        }
+
+        /*
+         * The pattern (including its NUL), plus the brick path, plus 36 for
+         * a GFID and 4 for the orig/link extension. The brick path must be
+         * counted too, or any base_path longer than a few bytes overruns
+         * the buffer.
+         */
+        dst_len = sizeof (orphan_file_pattern) + strlen (priv->base_path) + 40;
+        dst_path = alloca (dst_len);
+        snprintf (dst_path, dst_len, orphan_file_pattern, priv->base_path,
+                  uuid_utoa (inode->gfid), "orig");
+
+        gf_log (this->name, GF_LOG_INFO,
+                "move-aside: src = %s, dst = %s", src_path, dst_path);
+
+#if defined(MOVING_THE_FILE_WORKS)
+        /*
+         * This is how we should really do things, to avoid the overhead of
+         * copying (potentially large) amounts of data. Unfortunately, if the
+         * file and all of its xattrs aren't there, the self-heal that's the
+         * whole point of our little exercise doesn't work. The same might be
+         * true of the .glusterfs handle. Until some magic formula can be
+         * found, our most expedient choice is to *copy* the file instead of
+         * moving it.
+         */
+        if (sys_link (src_path, dst_path) < 0) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside link failed for %s", src_path);
+                goto done;
+        }
+
+        if (posix_unlink_gfid_handle_and_entry (this, src_path, &stbuf,
+                                                &op_errno) < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside unlink failed for %s", src_path);
+                goto done;
+        }
+#else
+        op_errno = posix_copy_file (src_path, dst_path);
+        if (op_errno != 0) {
+                /* Errors would have been reported already. */
+                goto done;
+        }
+#endif
+
+        /* Reuse dst_path: "link" is the same length as "orig". */
+        snprintf (dst_path, dst_len, orphan_file_pattern, priv->base_path,
+                  uuid_utoa (inode->gfid), "link");
+        my_str_len = sizeof (orphan_link_pattern) + strlen (rel_path);
+        link_tgt = alloca (my_str_len);
+        snprintf (link_tgt, my_str_len, orphan_link_pattern, rel_path);
+
+        if (sys_symlink (link_tgt, dst_path) < 0) {
+                /* This is deliberately not fatal. */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "move-aside could not link %s to %s",
+                        dst_path, link_tgt);
+        }
+
+done:
+        if (rel_path) {
+                GF_FREE (rel_path);
+        }
+        return op_errno;
+}
+
int32_t
posix_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
@@ -3948,6 +4115,17 @@ posix_setxattr (call_frame_t *frame, xlator_t *this,
#else
filler.flags = flags;
#endif
+
+ if (dict_get (dict, "trusted.move-aside")) {
+ dict_del (dict, "trusted.move-aside");
+ op_ret = posix_move_aside (frame, this, loc->inode);
+ if (op_ret != 0) {
+ op_errno = abs (op_ret);
+ op_ret = -1;
+ goto out;
+ }
+ }
+
op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair,
&filler);
if (op_ret < 0) {