summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tests/xfs-dio.t118
-rw-r--r--xlators/cluster/afr/src/afr-common.c22
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c82
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c141
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c3
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h7
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c42
-rw-r--r--xlators/cluster/afr/src/afr.c9
-rw-r--r--xlators/cluster/afr/src/afr.h3
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c6
-rw-r--r--xlators/storage/posix/src/posix-handle.c14
-rw-r--r--xlators/storage/posix/src/posix-helpers.c64
-rw-r--r--xlators/storage/posix/src/posix.c178
15 files changed, 669 insertions, 24 deletions
diff --git a/tests/xfs-dio.t b/tests/xfs-dio.t
new file mode 100644
index 00000000000..c8cabd5766b
--- /dev/null
+++ b/tests/xfs-dio.t
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+. $(dirname $0)/include.rc
+. $(dirname $0)/volume.rc
+
+# write_data PATH WORD...
+# Overwrite PATH with the remaining arguments, as printed by echo (joined by
+# single spaces and followed by a newline).
+write_data () {
+        local path=$1
+        shift
+        # Quote the redirect target so a path containing whitespace or glob
+        # characters is not split or expanded.
+        echo "$@" > "$path"
+}
+
+# create_index_entry BRICK RELPATH
+# Hard-link one of BRICK's existing xattrop-* files in
+# .glusterfs/indices/xattrop under the name that gf_gfid_xattr_to_str produces
+# for BRICK/RELPATH's gfid xattr, then set the file's
+# trusted.glusterfs.validate-status xattr to "suspect".
+# Returns non-zero if either the link or the setfattr fails.
+create_index_entry () {
+        local brick=$1
+        local gfid_str=$(gf_get_gfid_xattr $brick/$2)
+        local gfid_path=$(gf_gfid_xattr_to_str $gfid_str)
+        local xop_file=$(ls $brick/.glusterfs/indices/xattrop/xattrop-* \
+                         | tail -n1)
+        # Report a failed link to the caller; otherwise the exit status of
+        # setfattr below would hide it and TEST would still pass.
+        ln $xop_file $brick/.glusterfs/indices/xattrop/$gfid_path || return 1
+        setfattr -n trusted.glusterfs.validate-status -v suspect $brick/$2
+}
+
+# get_vstatus PATH
+# Print only the value of PATH's trusted.glusterfs.validate-status xattr.
+# Anything getfattr writes to stderr is discarded.
+get_vstatus () {
+        local target=$1
+        getfattr --name trusted.glusterfs.validate-status --only-values \
+                $target 2> /dev/null
+}
+
+trap cleanup EXIT
+
+TEST glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+
+# Comment out the following line to see the "argh" and "blah" tests at the end
+# fail. That's because normal self-heal can't deal with this particular
+# condition. To do that, we must check the actual data (OK, checksums). That's
+# expensive, but if there's corruption below us - e.g. filesystem bug, flaky
+# disk - then it's what we have to do.
+TEST $CLI volume set $V0 cluster.shd-validate-data on
+
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+TEST $GFS -s $H0 --volfile-id=$V0 $M0
+TEST mkdir $M0/xyz
+TEST write_data $M0/file-ok hello
+TEST write_data $M0/file-use0 hello
+TEST write_data $M0/xyz/file-use1 hello
+TEST write_data $M0/file-bad hello
+TEST umount $M0
+
+# Corrupt some replicas by writing directly into the brick directories,
+# bypassing the (now unmounted) volume.
+TEST write_data $B0/${V0}2/file-use0 'argh!'
+TEST write_data $B0/${V0}0/xyz/file-use1 'blah!'
+TEST write_data $B0/${V0}1/file-bad 'diffX'
+TEST write_data $B0/${V0}2/file-bad 'diffY'
+
+# Add the files to their indices.
+TEST create_index_entry $B0/${V0}0 file-ok
+TEST create_index_entry $B0/${V0}1 file-ok
+TEST create_index_entry $B0/${V0}2 file-ok
+TEST create_index_entry $B0/${V0}0 file-use0
+TEST create_index_entry $B0/${V0}1 file-use0
+TEST create_index_entry $B0/${V0}2 file-use0
+TEST create_index_entry $B0/${V0}0 xyz/file-use1
+TEST create_index_entry $B0/${V0}1 xyz/file-use1
+TEST create_index_entry $B0/${V0}2 xyz/file-use1
+TEST create_index_entry $B0/${V0}0 file-bad
+TEST create_index_entry $B0/${V0}1 file-bad
+TEST create_index_entry $B0/${V0}2 file-bad
+
+# Time to see what we can do.
+TEST $CLI volume heal $V0
+
+# These files are not marked in the normal way as needing heal (that's kind of
+# the whole problem) so heal counts aren't useful. There are only a few tiny
+# files, so just wait a few seconds for the heal to complete.
+sleep 5
+
+# Test the contents of the files.
+EXPECT hello cat $B0/${V0}0/file-ok
+EXPECT hello cat $B0/${V0}1/file-ok
+EXPECT hello cat $B0/${V0}2/file-ok
+EXPECT hello cat $B0/${V0}0/file-use0
+EXPECT hello cat $B0/${V0}1/file-use0
+EXPECT hello cat $B0/${V0}2/file-use0
+EXPECT hello cat $B0/${V0}0/xyz/file-use1
+EXPECT hello cat $B0/${V0}1/xyz/file-use1
+EXPECT hello cat $B0/${V0}2/xyz/file-use1
+# This was in three-way split brain, so the replicas should still diverge.
+EXPECT hello cat $B0/${V0}0/file-bad
+EXPECT diffX cat $B0/${V0}1/file-bad
+EXPECT diffY cat $B0/${V0}2/file-bad
+
+# Now test validation states.
+EXPECT clean get_vstatus $B0/${V0}0/file-ok
+EXPECT clean get_vstatus $B0/${V0}1/file-ok
+EXPECT clean get_vstatus $B0/${V0}2/file-ok
+EXPECT clean get_vstatus $B0/${V0}0/file-use0
+EXPECT clean get_vstatus $B0/${V0}1/file-use0
+EXPECT repaired get_vstatus $B0/${V0}2/file-use0
+EXPECT repaired get_vstatus $B0/${V0}0/xyz/file-use1
+EXPECT clean get_vstatus $B0/${V0}1/xyz/file-use1
+EXPECT clean get_vstatus $B0/${V0}2/xyz/file-use1
+EXPECT suspect get_vstatus $B0/${V0}0/file-bad
+EXPECT suspect get_vstatus $B0/${V0}1/file-bad
+EXPECT suspect get_vstatus $B0/${V0}2/file-bad
+
+# print_summary
+# Debugging aid (its only call is commented out).  For each test file name,
+# print the non-empty lines and the validate-status xattr of every copy found
+# under $B0, then print the contents of any *.orig files and list any *.link
+# files found under $B0.
+print_summary () {
+        local f
+        for f in file-ok file-bad file-use0 file-use1; do
+                echo "=== FILE $f"
+                # "-exec ... {} +" runs nothing when find matches nothing and
+                # passes file names through unsplit, unlike a pipe into xargs.
+                find $B0/ -name $f -exec grep -E . {} +
+                find $B0/ -name $f -exec getfattr -d -e text \
+                        -m trusted.glusterfs.validate-status {} +
+        done
+        echo "=== ORPHANS"
+        find $B0 -name '*.orig' -exec grep -E . {} +
+        # With GNU xargs an empty match list would have run a bare "ls -l",
+        # listing the current directory instead of nothing.
+        find $B0 -name '*.link' -exec ls -l {} +
+}
+
+#print_summary
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 7425b6688f2..f5210bd7d5d 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1267,7 +1267,7 @@ afr_inode_refresh_do (call_frame_t *frame, xlator_t *this)
return 0;
}
- ret = afr_xattr_req_prepare (this, xdata);
+ ret = afr_xattr_req_prepare (this, xdata, _gf_false);
if (ret != 0) {
dict_unref (xdata);
afr_inode_refresh_done (frame, this, -ret);
@@ -1360,7 +1360,7 @@ afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
int
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum)
{
int i = 0;
afr_private_t *priv = NULL;
@@ -1397,6 +1397,19 @@ afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
"Unable to set ancestry path key in dict ");
}
+ if (checksum) {
+ ret = dict_set_int32 (xattr_req, "get-checksum", 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to set get-checksum in dict ");
+ }
+ ret = dict_set_int32 (xattr_req, "trusted.glusterfs.validate-status", 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to set validate-status in dict ");
+ }
+ }
+
return ret;
}
@@ -1415,7 +1428,7 @@ afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
if (xattr_req && (xattr_req != local->xattr_req))
dict_copy (xattr_req, local->xattr_req);
- ret = afr_xattr_req_prepare (this, local->xattr_req);
+ ret = afr_xattr_req_prepare (this, local->xattr_req, _gf_false);
ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
if (ret < 0) {
@@ -5906,7 +5919,8 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
replies = alloca0 (sizeof (*replies) * priv->child_count);
- ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies,
+ priv->shd_validate_data);
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 800cf9705c9..5a9ab795a94 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -298,6 +298,51 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
return 0;
}
+/*
+ * Callback handed to AFR_ONLIST by afr_selfheal_update_vstatus for its
+ * setxattr fops.  The fop result (op_ret/op_errno/xdata) is not examined;
+ * the callback only wakes the barrier stored in the frame's afr_local_t.
+ */
+static int
+afr_selfheal_vstatus_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                          int op_ret, int op_errno, dict_t *xdata)
+{
+        afr_local_t *afr_local = NULL;
+
+        afr_local = frame->local;
+        syncbarrier_wake (&afr_local->barrier);
+
+        return 0;
+}
+
+/*
+ * afr_selfheal_update_vstatus:
+ *
+ * Send a setxattr of "trusted.glusterfs.validate-status" = @new_status for
+ * @inode through AFR_ONLIST, using @targets as the child list.  Failures to
+ * build the request dict are logged as warnings and nothing is sent; the
+ * function returns no status to its caller.
+ */
+void
+afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                             unsigned char *targets, char *new_status)
+{
+        loc_t   target_loc  = {0, };
+        dict_t *status_dict = NULL;
+        int     set_ret     = 0;
+
+        /* Reference taken here is dropped by loc_wipe on every exit path. */
+        target_loc.inode = inode_ref (inode);
+
+        status_dict = dict_new ();
+        if (status_dict == NULL) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unable to allocate validate-status for %s",
+                        uuid_utoa (inode->gfid));
+                loc_wipe (&target_loc);
+                return;
+        }
+
+        set_ret = dict_set_str (status_dict,
+                                "trusted.glusterfs.validate-status",
+                                new_status);
+        if (set_ret == 0) {
+                AFR_ONLIST (targets, frame, afr_selfheal_vstatus_cbk, setxattr,
+                            &target_loc, status_dict, 0, NULL);
+        } else {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "couldn't clear validate-status for %s",
+                        uuid_utoa (inode->gfid));
+        }
+
+        dict_unref (status_dict);
+        loc_wipe (&target_loc);
+}
+
void
afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count)
@@ -1379,7 +1424,7 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
if (xattr)
dict_copy (xattr, xattr_req);
- if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ if (afr_xattr_req_prepare (frame->this, xattr_req, _gf_false) != 0) {
dict_destroy (xattr_req);
return NULL;
}
@@ -1406,10 +1451,11 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
return inode;
}
-int
+static int
afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies,
- unsigned char *discover_on)
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on,
+ gf_boolean_t checksum)
{
loc_t loc = {0, };
dict_t *xattr_req = NULL;
@@ -1423,7 +1469,7 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
if (!xattr_req)
return -ENOMEM;
- if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ if (afr_xattr_req_prepare (frame->this, xattr_req, checksum) != 0) {
dict_destroy (xattr_req);
return -ENOMEM;
}
@@ -1444,14 +1490,15 @@ afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
int
afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies)
+ uuid_t gfid, struct afr_reply *replies,
+ gf_boolean_t checksum)
{
afr_private_t *priv = NULL;
priv = frame->this->private;
return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies,
- priv->child_up);
+ priv->child_up, checksum);
}
unsigned int
@@ -1865,7 +1912,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
replies = alloca0 (sizeof (*replies) * priv->child_count);
- ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies, _gf_false);
if (ret)
goto out;
@@ -1983,6 +2030,22 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
if (data_selfheal)
*data_selfheal = _gf_true;
}
+
+ if (priv->shd_validate_data && data_selfheal && !*data_selfheal) {
+ if (IA_ISREG (replies[i].poststat.ia_type)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "forcing data self-heal on %s",
+ uuid_utoa (replies[i].poststat.ia_gfid));
+ /*
+ * This will force our caller (e.g.
+ * afr_selfheal_do) to call afr_selfheal_data,
+ * even though it might otherwise think
+ * everything looks OK. From there, we'll do a
+ * more thorough inspection including checksums.
+ */
+ *data_selfheal = _gf_true;
+ }
+ }
}
if (valid_cnt > 0 && link_inode) {
@@ -1999,7 +2062,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
ret = 0;
out:
if (inode)
- inode_unref (inode);
+ inode_unref (inode);
if (replies)
afr_replies_wipe (replies, priv->child_count);
@@ -2140,7 +2203,6 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
&data_selfheal,
&metadata_selfheal,
&entry_selfheal);
-
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index c1e945bfd82..894c8e68f25 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -587,6 +587,135 @@ out:
return source;
}
+/*
+ * afr_move_aside:
+ *
+ * Send a synchronous setxattr carrying the key "trusted.move-aside" for
+ * @inode to child @i only.
+ *
+ * Returns 0 if the request was built and syncop_setxattr returned 0, and -1
+ * otherwise; each failure is logged at error level.
+ */
+static int
+afr_move_aside (call_frame_t *frame, xlator_t *this, inode_t *inode, int i)
+{
+        afr_private_t *conf     = this->private;
+        dict_t        *req      = NULL;
+        loc_t          file_loc = {0, };
+        int            status   = -1;
+
+        /* Reference taken here is dropped by loc_wipe below. */
+        file_loc.inode = inode_ref (inode);
+
+        req = dict_new ();
+        if (req == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to alloc move-aside dict for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else if (dict_set_str (req, "trusted.move-aside", "please") != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set move-aside xattr for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else if (syncop_setxattr (conf->children[i], &file_loc, req, 0,
+                                    NULL, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to send move-aside fop for %s on child %d",
+                        uuid_utoa (inode->gfid), i);
+        } else {
+                status = 0;
+        }
+
+        if (req != NULL) {
+                dict_unref (req);
+        }
+        loc_wipe (&file_loc);
+
+        return status;
+}
+
+/*
+ * afr_handle_validation:
+ *
+ * When shd-validate-data is enabled, compare the per-child "checksum" values
+ * found in replies[].xdata and adjust sources[]/sinks[] accordingly.
+ *
+ * Nothing is changed unless every child supplied xdata, a
+ * "trusted.glusterfs.validate-status" value beginning with "suspect", and a
+ * "checksum" value.
+ *
+ * Outcomes:
+ *  - all checksums equal: validate-status is updated to "clean" on sources[];
+ *  - exactly one child disagrees with all the others (only considered with
+ *    three or more children): that child is sent a move-aside request and is
+ *    switched from source to sink;
+ *  - anything else: every child is marked as a sink and none as a source.
+ */
+static void
+afr_handle_validation (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                       unsigned char *sources, unsigned char *sinks,
+                       struct afr_reply *replies)
+{
+        afr_private_t   *priv = this->private;
+        uint32_t        *values;
+        int              i;
+        int              same_as[2] = {0, 0};
+        int              good = -1;     /* a child on the agreeing side */
+        char            *vstatus;
+
+        if (!priv->shd_validate_data) {
+                return;
+        }
+
+        values = alloca0 (sizeof (*values) * priv->child_count);
+        for (i = 0; i < priv->child_count; ++i) {
+                if (!replies[i].xdata) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no xdata for child %d", i);
+                        return;
+                }
+                if (dict_get_str (replies[i].xdata,
+                                  "trusted.glusterfs.validate-status",
+                                  &vstatus) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "no validate-status for child %d", i);
+                        return;
+                }
+                if (strncmp (vstatus, "suspect", 7) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "validate-status = %s for child %d", vstatus, i);
+                        return;
+                }
+                if (dict_get_uint32 (replies[i].xdata, "checksum",
+                                     &values[i]) != 0) {
+                        return;
+                }
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "checksum for child %d is 0x%x", i, values[i]);
+        }
+
+        /*
+         * Shortcut: look only for a single odd man out rather than a
+         * generalized minority.  For that it is enough to count how many
+         * children match child 0 and how many match child 1.
+         */
+        for (i = 0; i < priv->child_count; ++i) {
+                same_as[0] += (values[i] == values[0]);
+                /* values[1] does not exist when there is a single child. */
+                if (priv->child_count > 1) {
+                        same_as[1] += (values[i] == values[1]);
+                }
+        }
+
+        if (same_as[0] == priv->child_count) {
+                gf_log (this->name, GF_LOG_DEBUG, "everything's OK");
+                afr_selfheal_update_vstatus (frame, this, inode,
+                                             sources, "clean");
+                return;
+        }
+
+        /*
+         * One dissenter can only be outvoted when at least two other
+         * children agree with each other.  With two children that differ
+         * there is no way to tell which one is bad, so that case must fall
+         * through to the "split" handling instead of trusting child 0.
+         */
+        if (priv->child_count >= 3) {
+                if (same_as[0] == (priv->child_count - 1)) {
+                        good = 0;
+                } else if (same_as[1] == (priv->child_count - 1)) {
+                        good = 1;
+                }
+        }
+
+        if (good >= 0) {
+                gf_log (this->name, GF_LOG_DEBUG, "odd man out, use %d", good);
+                for (i = 0; i < priv->child_count; ++i) {
+                        if (values[i] != values[good]) {
+                                sources[i] = 0;
+                                sinks[i] = 1;
+                                /* Failures are logged by afr_move_aside. */
+                                afr_move_aside (frame, this, inode, i);
+                        }
+                }
+                return;
+        }
+
+        gf_log (this->name, GF_LOG_WARNING, "three-way split on %s",
+                uuid_utoa (inode->gfid));
+        for (i = 0; i < priv->child_count; ++i) {
+                sources[i] = 0;
+                sinks[i] = 1;
+        }
+}
+
/*
* __afr_selfheal_data_prepare:
*
@@ -612,7 +741,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, priv->shd_validate_data);
if (ret)
return ret;
@@ -625,6 +754,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
if (ret)
return ret;
+ afr_handle_validation (frame, this, inode, sources, sinks, replies);
+
/* Initialize the healed_sinks[] array optimistically to
the intersection of to-be-healed (i.e sinks[]) and
the list of servers which are up (i.e locked_on[]).
@@ -749,6 +880,14 @@ restore_time:
sources, sinks, healed_sinks,
undid_pending, AFR_DATA_TRANSACTION,
locked_replies, data_lock);
+
+ if (priv->shd_validate_data) {
+ afr_selfheal_update_vstatus (frame, this, fd->inode,
+ healed_sinks, "repaired");
+ afr_selfheal_update_vstatus (frame, this, fd->inode,
+ sources, "clean");
+ }
+
skip_undo_pending:
afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
data_lock);
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 25f8ea313aa..e0a82426a33 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -489,7 +489,7 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this,
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, _gf_false);
if (ret)
return ret;
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 4570ace7ef7..85dbdf2976e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -336,7 +336,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
priv = this->private;
ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
+ replies, _gf_false);
if (ret)
return ret;
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index 8e5546a702f..db78ef81804 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -796,7 +796,8 @@ __afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *paren
replies = alloca0 (priv->child_count * sizeof(*replies));
- ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies);
+ ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies,
+ _gf_false);
if (ret)
goto out;
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 0a3d6482ca3..b33db59b50f 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -143,7 +143,8 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
int
afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies);
+ uuid_t gfid, struct afr_reply *replies,
+ gf_boolean_t checksum);
inode_t *
afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
@@ -165,6 +166,10 @@ int
afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
afr_transaction_type type, int *dirty, int **matrix);
+void
+afr_selfheal_update_vstatus (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *targets, char *new_status);
+
int
afr_sh_generic_fop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *pre,
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 26f4a80777f..9c4f3ec81f6 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -315,8 +315,8 @@ afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent,
return ret;
}
-int
-afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
+static int
+_afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
{
int ret = 0;
eh_t *eh = NULL;
@@ -377,6 +377,44 @@ out:
}
+/*
+ * afr_shd_selfheal:
+ *
+ * Non-static wrapper around _afr_shd_selfheal.  A negative result is passed
+ * through unchanged.  When shd-validate-data is enabled, every non-negative
+ * result is reported as 2.
+ *
+ * Rationale, as recorded by the author of this change: afr_shd_zero_xattrop
+ * is only called for a return value of two, which non-obviously means that
+ * no heal was deemed necessary.  With data validation enabled a heal is made
+ * to look necessary, so the part that would normally return that value is
+ * skipped, and only several layers deeper does it turn out that everything
+ * was fine.  Getting a 2 to survive all the intervening layers was judged
+ * riskier than overriding the result here, although this override is not
+ * entirely without risk either.
+ *
+ * NOTE(review): the author also states that it is not understood how the
+ * index entry gets removed in the other cases; it was only observed that it
+ * does.
+ */
+int
+afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
+{
+        afr_private_t *conf     = healer->this->private;
+        int            heal_ret = 0;
+
+        heal_ret = _afr_shd_selfheal (healer, child, gfid);
+
+        if ((heal_ret < 0) || !conf->shd_validate_data) {
+                return heal_ret;
+        }
+
+        return 2;
+}
+
void
afr_shd_sweep_prepare (struct subvol_healer *healer)
{
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 86f667116af..f291626fff9 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -167,6 +167,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("pgfid-self-heal", priv->pgfid_self_heal,
options, bool, out);
+ GF_OPTION_RECONF ("shd-validate-data", priv->shd_validate_data,
+ options, bool, out);
+
GF_OPTION_RECONF ("data-self-heal-window-size",
priv->data_self_heal_window_size, options,
uint32, out);
@@ -426,6 +429,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("pgfid-self-heal", priv->pgfid_self_heal, bool, out);
+ GF_OPTION_INIT ("shd-validate-data", priv->shd_validate_data, bool, out);
+
GF_OPTION_INIT ("background-self-heal-count",
priv->background_self_heal_count, uint32, out);
@@ -1112,5 +1117,9 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
},
+ { .key = {"shd-validate-data"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index b60822d0ca9..3314f865781 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -101,6 +101,7 @@ typedef struct _afr_private {
unsigned char *child_up;
int64_t *child_latency;
gf_boolean_t pgfid_self_heal;
+ gf_boolean_t shd_validate_data;;
unsigned char *local;
char **pending_key;
@@ -1101,7 +1102,7 @@ int
afr_final_errno (afr_local_t *local, afr_private_t *priv);
int
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, gf_boolean_t checksum);
void
afr_fix_open (fd_t *fd, xlator_t *this);
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index d8c81900b6f..985490892f3 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1320,6 +1320,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.description = "Use PGFID attribute if available to remediate "
"failed heals."
},
+ { .key = "cluster.shd-validate-data",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT,
+ .description = "Check data as well as flags for self-heal."
+ },
/* stripe xlator options */
{ .key = "cluster.stripe-block-size",
diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c
index da0bca6e41c..fb112d8172d 100644
--- a/xlators/storage/posix/src/posix-handle.c
+++ b/xlators/storage/posix/src/posix-handle.c
@@ -26,6 +26,8 @@
#include "compat-errno.h"
+#define GF_ORPHAN_PATH "orphans"
+
inode_t *
posix_resolve (xlator_t *this, inode_table_t *itable, inode_t *parent,
char *bname, struct iatt *iabuf)
@@ -515,7 +517,7 @@ posix_handle_init (xlator_t *this)
}
handle_pfx = alloca (priv->base_path_length + 1 + strlen (GF_HIDDEN_PATH)
- + 1);
+ + 1 + strlen (GF_ORPHAN_PATH) + 1);
sprintf (handle_pfx, "%s/%s", priv->base_path, GF_HIDDEN_PATH);
@@ -599,6 +601,16 @@ posix_handle_init (xlator_t *this)
break;
}
+ sprintf (handle_pfx, "%s/%s/%s", priv->base_path, GF_HIDDEN_PATH,
+ GF_ORPHAN_PATH);
+ if (mkdir (handle_pfx, 0700) < 0) {
+ if (errno != EEXIST) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to create orphans directory (%s)",
+ strerror (errno));
+ }
+ }
+
return 0;
}
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index 6bd6bb1a0bf..5c7cca9fbac 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -18,6 +18,7 @@
#include <ftw.h>
#include <sys/stat.h>
#include <signal.h>
+#include <openssl/md5.h>
#ifdef HAVE_SYS_ACL_H
#ifdef HAVE_ACL_LIBACL_H /* for acl_to_any_text() */
@@ -239,7 +240,8 @@ _posix_xattr_get_set_from_backend (posix_xattr_filler_t *filler, char *key)
}
value[xattr_size] = '\0';
- ret = dict_set_bin (filler->xattr, key, value, xattr_size);
+ ret = dict_set_bin (filler->xattr, key,
+ value, xattr_size);
if (ret < 0) {
if (filler->real_path)
gf_msg_debug (filler->this->name, 0,
@@ -798,6 +800,45 @@ out:
return;
}
+/*
+ * _handle_cksum:
+ *
+ * Compute the MD5 digest of the whole file at @path and store the first
+ * sizeof (uint32_t) bytes of the digest, read as a host-order integer, in
+ * @result.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened and -2 if a read
+ * fails.  @result is written only on success.
+ */
+static int32_t
+_handle_cksum (const char *path, uint32_t *result)
+{
+        int             fd;
+        ssize_t         bytes;  /* signed: read() reports failure as -1 */
+        unsigned char   buf[65536];
+        unsigned char   cksum[MD5_DIGEST_LENGTH] = {0, };
+        MD5_CTX         mdctx;
+
+        fd = open (path, O_RDONLY);
+        if (fd < 0) {
+                return -1;
+        }
+
+        MD5_Init (&mdctx);
+
+        /*
+         * Read until read() itself reports end of file (0).  A short read
+         * is not treated as the end of the file.
+         */
+        for (;;) {
+                bytes = read (fd, buf, sizeof (buf));
+                if (bytes < 0) {
+                        close (fd);
+                        return -2;
+                }
+                if (bytes == 0) {
+                        break;
+                }
+                MD5_Update (&mdctx, buf, (size_t) bytes);
+        }
+
+        MD5_Final (cksum, &mdctx);
+
+        close (fd);
+        /*
+         * NOTE(review): this cast relies on cksum being suitably aligned for
+         * a uint32_t load; copying the bytes out would not.
+         */
+        *result = *((uint32_t *)cksum);
+        return 0;
+}
+
dict_t *
posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
int fdnum, dict_t *xattr_req, struct iatt *buf)
@@ -805,12 +846,19 @@ posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
dict_t *xattr = NULL;
posix_xattr_filler_t filler = {0, };
gf_boolean_t list = _gf_false;
+ gf_boolean_t do_cksum = _gf_false;
+ uint32_t cksum_val;
if (dict_get (xattr_req, "list-xattr")) {
dict_del (xattr_req, "list-xattr");
list = _gf_true;
}
+ if (dict_get (xattr_req, "get-checksum")) {
+ dict_del (xattr_req, "get-checksum");
+ do_cksum = _gf_true;
+ }
+
xattr = dict_new ();
if (!xattr) {
goto out;
@@ -828,6 +876,20 @@ posix_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, fd_t *fd,
if (list)
_handle_list_xattr (xattr_req, real_path, fdnum, &filler);
+ if (do_cksum) {
+ if (_handle_cksum (real_path, &cksum_val) == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "generated checksum %x for %s", cksum_val, real_path);
+ if (dict_set_uint32 (xattr, "checksum", cksum_val) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not store checksum for %s", real_path);
+ }
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not get checksum for %s", real_path);
+ }
+ }
+
out:
return xattr;
}
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index 32b961b680d..b461d4ea4a1 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -3903,6 +3903,173 @@ map_xattr_flags(int flags)
}
#endif
+/*
+ * posix_copy_file:
+ *
+ * Copy the contents of @src to @dst.  @dst is created with mode 0666 if it
+ * does not exist and truncated if it does, so no bytes of an older, longer
+ * file survive past the end of the copy.
+ *
+ * Returns 0 on success or a positive errno value on failure; every failure
+ * is logged here.  A failing fsync on @dst is logged as a warning only and
+ * does not fail the copy.
+ */
+static int
+posix_copy_file (char *src, char *dst)
+{
+        int     ifd = -1;
+        int     ofd = -1;
+        char    buf[65536];
+        ssize_t ibytes;
+        ssize_t obytes;
+        int     ret = 0;
+
+        ifd = open (src, O_RDONLY);
+        if (ifd < 0) {
+                ret = errno;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "opening source file %s failed (%s)",
+                        src, strerror (ret));
+                goto done;
+        }
+
+        ofd = open (dst, O_WRONLY|O_CREAT|O_TRUNC, 0666);
+        if (ofd < 0) {
+                ret = errno;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "opening destination file %s failed (%s)",
+                        dst, strerror (ret));
+                goto done;
+        }
+
+        for (;;) {
+                ibytes = read (ifd, buf, sizeof (buf));
+                if (ibytes < 0) {
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "reading source file %s failed (%s)",
+                                src, strerror (ret));
+                        goto done;
+                }
+                if (ibytes == 0) {
+                        break;
+                }
+                obytes = write (ofd, buf, ibytes);
+                if (obytes < ibytes) {
+                        if (obytes < 0) {
+                                ret = errno;
+                                gf_log (THIS->name, GF_LOG_ERROR,
+                                        "writing destination file %s failed (%s)",
+                                        dst, strerror (ret));
+                        } else {
+                                /*
+                                 * A short write does not set errno, so the
+                                 * stale value (possibly 0, i.e. "success")
+                                 * must not be returned.
+                                 */
+                                ret = EIO;
+                                gf_log (THIS->name, GF_LOG_ERROR,
+                                        "only wrote %zd/%zd bytes from %s to %s",
+                                        obytes, ibytes, src, dst);
+                        }
+                        goto done;
+                }
+        }
+
+        if (fsync (ofd) < 0) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "fsync failed copying %s (%s)",
+                        dst, strerror (errno));
+        }
+        ret = 0;
+
+done:
+        if (ifd >= 0) {
+                close (ifd);
+        }
+        if (ofd >= 0) {
+                close (ofd);
+        }
+        return ret;
+}
+
+static char orphan_file_pattern[] = "%s/.glusterfs/orphans/%s.%s";
+static char orphan_link_pattern[] = "../..%s";
+
+/*
+ * posix_move_aside:
+ *
+ * Save this brick's copy of the file behind @inode as
+ * <base_path>/.glusterfs/orphans/<gfid>.orig, and create a <gfid>.link
+ * symlink next to it whose target is "../.." followed by the inode's path.
+ * Unless MOVING_THE_FILE_WORKS is defined, the file is copied rather than
+ * moved, so the original stays in place.
+ *
+ * Returns 0 on success.  On failure returns ENOENT when no path can be
+ * built for the inode, the non-zero result of posix_pstat, or the error
+ * reported by posix_copy_file.  Failing to create the symlink is logged as
+ * a warning and is not treated as an error.
+ */
+static int32_t
+posix_move_aside (call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+        struct posix_private *priv     = this->private;
+        char                 *rel_path = NULL;
+        char                 *src_path = NULL;
+        char                 *dst_path = NULL;
+        char                 *link_tgt = NULL;
+        size_t                src_len  = 0;
+        size_t                dst_len  = 0;
+        size_t                tgt_len  = 0;
+        int32_t               op_errno = 0;
+        struct iatt           stbuf;
+
+        if (inode_path (inode, NULL, &rel_path) <= 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not get move-aside path for %s",
+                        uuid_utoa (inode->gfid));
+                op_errno = ENOENT;
+                goto done;
+        }
+
+        src_len = strlen (priv->base_path) + strlen (rel_path) + 1;
+        src_path = alloca (src_len);
+        snprintf (src_path, src_len, "%s%s", priv->base_path, rel_path);
+
+        op_errno = posix_pstat (this, NULL, src_path, &stbuf);
+        if (op_errno != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside stat failed for %s", src_path);
+                goto done;
+        }
+
+        /*
+         * The destination holds the brick path, the literal text of the
+         * pattern plus the NUL (covered by sizeof), 36 for a GFID and 4 for
+         * the orig/link extension.  Leaving out the brick path length would
+         * overflow the buffer for all but the shortest base paths.
+         */
+        dst_len = strlen (priv->base_path) + sizeof (orphan_file_pattern) + 40;
+        dst_path = alloca (dst_len);
+        snprintf (dst_path, dst_len, orphan_file_pattern, priv->base_path,
+                  uuid_utoa (inode->gfid), "orig");
+
+        gf_log (this->name, GF_LOG_INFO,
+                "move-aside: src = %s, dst = %s", src_path, dst_path);
+
+#if defined(MOVING_THE_FILE_WORKS)
+        /*
+         * This is how we should really do things, to avoid the overhead of
+         * copying (potentially large) amounts of data.  Unfortunately, if the
+         * file and all of its xattrs aren't there, the self-heal that's the
+         * whole point of our little exercise doesn't work.  The same might be
+         * true of the .glusterfs handle.  Until some magic formula can be
+         * found, our most expedient choice is to *copy* the file instead of
+         * moving it.
+         */
+        if (sys_link (src_path, dst_path) < 0) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside link failed for %s", src_path);
+                goto done;
+        }
+
+        if (posix_unlink_gfid_handle_and_entry (this, src_path, &stbuf,
+                                                &op_errno) < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "move-aside unlink failed for %s", src_path);
+                goto done;
+        }
+#else
+        op_errno = posix_copy_file (src_path, dst_path);
+        if (op_errno != 0) {
+                /* Errors would have been reported already. */
+                goto done;
+        }
+#endif
+
+        /* Reuse dst_path for the symlink name; "link" is as long as "orig". */
+        snprintf (dst_path, dst_len, orphan_file_pattern, priv->base_path,
+                  uuid_utoa (inode->gfid), "link");
+        tgt_len = sizeof (orphan_link_pattern) + strlen (rel_path);
+        link_tgt = alloca (tgt_len);
+        snprintf (link_tgt, tgt_len, orphan_link_pattern, rel_path);
+
+        if (sys_symlink (link_tgt, dst_path) < 0) {
+                /* This is deliberately not fatal. */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "move-aside could not link %s to %s",
+                        dst_path, link_tgt);
+        }
+
+done:
+        if (rel_path) {
+                GF_FREE (rel_path);
+        }
+        return op_errno;
+}
+
int32_t
posix_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
@@ -3948,6 +4115,17 @@ posix_setxattr (call_frame_t *frame, xlator_t *this,
#else
filler.flags = flags;
#endif
+
+ if (dict_get (dict, "trusted.move-aside")) {
+ dict_del (dict, "trusted.move-aside");
+ op_ret = posix_move_aside (frame, this, loc->inode);
+ if (op_ret != 0) {
+ op_errno = abs (op_ret);
+ op_ret = -1;
+ goto out;
+ }
+ }
+
op_ret = dict_foreach (dict, _handle_setxattr_keyvalue_pair,
&filler);
if (op_ret < 0) {