summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorkarthik-us <ksubrahm@redhat.com>2019-03-12 15:17:48 +0530
committerShyamsundar Ranganathan <srangana@redhat.com>2019-03-21 10:36:15 +0000
commit2a872bbcdedb6218e666ebcd1827d59a99912eb1 (patch)
tree158e51c6663c30acecf31ab204cbc33183a2db05
parentd2b8167aeb5f5559a16eef4604b46f71f3dbf894 (diff)
cluster/afr: Send truncate on arbiter brick from SHD
Problem: In an arbiter volume configuration SHD will not send any writes onto the arbiter brick even if there is data pending marker for the arbiter brick. If we have a arbiter setup on the geo-rep master and there are data pending markers for the files on arbiter brick, SHD will not mark any data changelog during healing. While syncing the data from master to slave, if the arbiter-brick is considered as ACTIVE, then there is a chance that slave will miss out some data. If the arbiter brick is being newly added or replaced there is a chance of slave missing all the data during sync. Fix: If there is data pending marker for the arbiter brick, send truncate on the arbiter brick during heal, so that it will record truncate as the data transaction in changelog. Change-Id: I3242ba6cea6da495c418ef860d9c3359c5459dec fixes: bz#1687746 Signed-off-by: karthik-us <ksubrahm@redhat.com>
-rw-r--r--tests/bugs/replicate/bug-1686568-send-truncate-on-arbiter-from-shd.t38
-rw-r--r--tests/utils/changelogparser.py234
-rw-r--r--tests/volume.rc7
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c26
4 files changed, 291 insertions, 14 deletions
diff --git a/tests/bugs/replicate/bug-1686568-send-truncate-on-arbiter-from-shd.t b/tests/bugs/replicate/bug-1686568-send-truncate-on-arbiter-from-shd.t
new file mode 100644
index 00000000000..78581e99614
--- /dev/null
+++ b/tests/bugs/replicate/bug-1686568-send-truncate-on-arbiter-from-shd.t
@@ -0,0 +1,38 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+CHANGELOG_PATH_0="$B0/${V0}2/.glusterfs/changelogs"
+ROLLOVER_TIME=100
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 changelog.changelog on
+TEST $CLI volume set $V0 changelog.rollover-time $ROLLOVER_TIME
+TEST $CLI volume start $V0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST dd if=/dev/zero of=$M0/file1 bs=128K count=5
+
+TEST $CLI volume profile $V0 start
+TEST $CLI volume add-brick $V0 replica 3 arbiter 1 $H0:$B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+TEST $CLI volume profile $V0 info
+truncate_count=$($CLI volume profile $V0 info | grep TRUNCATE | awk '{count += $8} END {print count}')
+
+EXPECT "1" echo $truncate_count
+EXPECT "1" check_changelog_op ${CHANGELOG_PATH_0} "^ D "
+
+cleanup;
diff --git a/tests/utils/changelogparser.py b/tests/utils/changelogparser.py
new file mode 100644
index 00000000000..e173e52cbe6
--- /dev/null
+++ b/tests/utils/changelogparser.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Why?
+
+Converts this
+
+GlusterFS Changelog | version: v1.1 | encoding : 2
+E0b99ef11-4b79-4cd0-9730-b5a0e8c4a8c0^@4^@16877^@0^@0^@00000000-0000-0000-0000-
+000000000001/dir1^@Ec5250af6-720e-4bfe-b938-827614304f39^@23^@33188^@0^@0^@0b99
+ef11-4b79-4cd0-9730-b5a0e8c4a8c0/hello.txt^@Dc5250af6-720e-4bfe-b938-827614304f
+39^@Dc5250af6-720e-4bfe-b938-827614304f39^@
+
+
+to human readable :)
+
+E 0b99ef11-4b79-4cd0-9730-b5a0e8c4a8c0 MKDIR 16877 0 000000000-0000-0000-0000
+ -000000000001/dir1
+E c5250af6-720e-4bfe-b938-827614304f39 CREATE 33188 0 0 0b99ef11-4b79-4cd0-9730
+ -b5a0e8c4a8c0/hello.txt
+D c5250af6-720e-4bfe-b938-827614304f39
+D c5250af6-720e-4bfe-b938-827614304f39
+
+
+"""
+import sys
+import codecs
+
+ENTRY = 'E'
+META = 'M'
+DATA = 'D'
+SEP = "\x00"
+
+GF_FOP = [
+ "NULL", "STAT", "READLINK", "MKNOD", "MKDIR", "UNLINK",
+ "RMDIR", "SYMLINK", "RENAME", "LINK", "TRUNCATE", "OPEN",
+ "READ", "WRITE", "STATFS", "FLUSH", "FSYNC", "SETXATTR",
+ "GETXATTR", "REMOVEXATTR", "OPENDIR", "FSYNCDIR", "ACCESS",
+ "CREATE", "FTRUNCATE", "FSTAT", "LK", "LOOKUP", "READDIR",
+ "INODELK", "FINODELK", "ENTRYLK", "FENTRYLK", "XATTROP",
+ "FXATTROP", "FSETXATTR", "FGETXATTR", "RCHECKSUM", "SETATTR",
+ "FSETATTR", "READDIRP", "GETSPEC", "FORGET", "RELEASE",
+ "RELEASEDIR", "FREMOVEXATTR", "FALLOCATE", "DISCARD", "ZEROFILL"]
+
+
+class NumTokens_V11(object):
+ E = 7
+ M = 3
+ D = 2
+ NULL = 3
+ MKNOD = 7
+ MKDIR = 7
+ UNLINK = 4
+ RMDIR = 4
+ SYMLINK = 4
+ RENAME = 5
+ LINK = 4
+ SETXATTR = 3
+ REMOVEXATTR = 3
+ CREATE = 7
+ SETATTR = 3
+ FTRUNCATE = 3
+ FXATTROP = 3
+
+
+class NumTokens_V12(NumTokens_V11):
+ UNLINK = 5
+ RMDIR = 5
+
+
+class Version:
+ V11 = "v1.1"
+ V12 = "v1.2"
+
+
+class Record(object):
+ def __init__(self, **kwargs):
+ self.ts = kwargs.get("ts", None)
+ self.fop_type = kwargs.get("fop_type", None)
+ self.gfid = kwargs.get("gfid", None)
+ self.path = kwargs.get("path", None)
+ self.fop = kwargs.get("fop", None)
+ self.path1 = kwargs.get("path1", None)
+ self.path2 = kwargs.get("path2", None)
+ self.mode = kwargs.get("mode", None)
+ self.uid = kwargs.get("uid", None)
+ self.gid = kwargs.get("gid", None)
+
+ def create_mknod_mkdir(self, **kwargs):
+ self.path = kwargs.get("path", None)
+ self.fop = kwargs.get("fop", None)
+ self.mode = kwargs.get("mode", None)
+ self.uid = kwargs.get("uid", None)
+ self.gid = kwargs.get("gid", None)
+
+ def metadata(self, **kwargs):
+ self.fop = kwargs.get("fop", None)
+
+ def rename(self, **kwargs):
+ self.fop = kwargs.get("fop", None)
+ self.path1 = kwargs.get("path1", None)
+ self.path2 = kwargs.get("path2", None)
+
+ def link_symlink_unlink_rmdir(self, **kwargs):
+ self.path = kwargs.get("path", None)
+ self.fop = kwargs.get("fop", None)
+
+ def __unicode__(self):
+ if self.fop_type == "D":
+ return u"{ts} {fop_type} {gfid}".format(**self.__dict__)
+ elif self.fop_type == "M":
+ return u"{ts} {fop_type} {gfid} {fop}".format(**self.__dict__)
+ elif self.fop_type == "E":
+ if self.fop in ["CREATE", "MKNOD", "MKDIR"]:
+ return (u"{ts} {fop_type} {gfid} {fop} "
+ u"{path} {mode} {uid} {gid}".format(**self.__dict__))
+ elif self.fop == "RENAME":
+ return (u"{ts} {fop_type} {gfid} {fop} "
+ u"{path1} {path2}".format(**self.__dict__))
+ elif self.fop in ["LINK", "SYMLINK", "UNLINK", "RMDIR"]:
+ return (u"{ts} {fop_type} {gfid} {fop} "
+ u"{path}".format(**self.__dict__))
+ else:
+ return repr(self.__dict__)
+ else:
+ return repr(self.__dict__)
+
+ def __str__(self):
+ return unicode(self).encode('utf-8')
+
+
+def get_num_tokens(data, tokens, version=Version.V11):
+ if version == Version.V11:
+ cls_numtokens = NumTokens_V11
+ elif version == Version.V12:
+ cls_numtokens = NumTokens_V12
+ else:
+ sys.stderr.write("Unknown Changelog Version\n")
+ sys.exit(1)
+
+ if data[tokens[0]] in [ENTRY, META]:
+ if len(tokens) >= 3:
+ return getattr(cls_numtokens, GF_FOP[int(data[tokens[2]])])
+ else:
+ return None
+ else:
+ return getattr(cls_numtokens, data[tokens[0]])
+
+
+def process_record(data, tokens, changelog_ts, callback):
+ if data[tokens[0]] in [ENTRY, META]:
+ try:
+ tokens[2] = GF_FOP[int(data[tokens[2]])]
+ except ValueError:
+ tokens[2] = "NULL"
+
+ if not changelog_ts:
+ ts1 = int(changelog_ts)
+ else:
+ ts1=""
+ record = Record(ts=ts1, fop_type=data[tokens[0]],
+ gfid=data[tokens[1]])
+ if data[tokens[0]] == META:
+ record.metadata(fop=tokens[2])
+ elif data[tokens[0]] == ENTRY:
+ if tokens[2] in ["CREATE", "MKNOD", "MKDIR"]:
+ record.create_mknod_mkdir(fop=tokens[2],
+ path=data[tokens[6]],
+ mode=int(data[tokens[3]]),
+ uid=int(data[tokens[4]]),
+ gid=int(data[tokens[5]]))
+ elif tokens[2] == "RENAME":
+ record.rename(fop=tokens[2],
+ path1=data[tokens[3]],
+ path2=data[tokens[4]])
+ if tokens[2] in ["LINK", "SYMLINK", "UNLINK", "RMDIR"]:
+ record.link_symlink_unlink_rmdir(fop=tokens[2],
+ path=data[tokens[3]])
+ callback(record)
+
+
+def default_callback(record):
+ sys.stdout.write(u"{0}\n".format(record))
+
+
+def parse(filename, callback=default_callback):
+ data = None
+ tokens = []
+ changelog_ts = filename.rsplit(".")[-1]
+ with codecs.open(filename, mode="rb", encoding="utf-8") as f:
+ # GlusterFS Changelog | version: v1.1 | encoding : 2
+ header = f.readline()
+ version = header.split()[4]
+
+ data = f.readline()
+
+ slice_start = 0
+ in_record = False
+
+ prev_char = ""
+ next_char = ""
+ for i, c in enumerate(data):
+ next_char = ""
+ if len(data) >= (i + 2):
+ next_char = data[i+1]
+
+ if not in_record and c in [ENTRY, META, DATA]:
+ tokens.append(slice(slice_start, i+1))
+ slice_start = i+1
+ in_record = True
+ continue
+
+ if c == SEP and ((prev_char != SEP and next_char == SEP) or
+ (prev_char == SEP and next_char != SEP) or
+ (prev_char != SEP and next_char != SEP)):
+ tokens.append(slice(slice_start, i))
+ slice_start = i+1
+
+ num_tokens = get_num_tokens(data, tokens, version)
+
+ if num_tokens == len(tokens):
+ process_record(data, tokens, changelog_ts, callback)
+ in_record = False
+ tokens = []
+
+ prev_char = c
+
+ # process last record
+ if slice_start < (len(data) - 1):
+ tokens.append(slice(slice_start, len(data)))
+ process_record(data, tokens, changelog_ts, callback)
+ tokens = []
+
+parse(sys.argv[1])
diff --git a/tests/volume.rc b/tests/volume.rc
index 44428606711..8f2cb39aeb1 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -842,3 +842,10 @@ function get_mount_lru_size_value {
rm -f $statedump
echo $val
}
+
+function check_changelog_op {
+ local clog_path=$1
+ local op=$2
+
+ $PYTHON $(dirname $0)/../../utils/changelogparser.py ${clog_path}/CHANGELOG | grep "$op" | wc -l
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 2ac6e475bb2..f468fc8ad64 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -399,17 +399,18 @@ __afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this,
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- unsigned char arbiter_sink_status = 0;
int i = 0;
local = frame->local;
priv = this->private;
- if (priv->arbiter_count) {
- arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
- healed_sinks[ARBITER_BRICK_INDEX] = 0;
- }
-
+ /* This will send truncate on the arbiter brick as well if it is marked
+ * as sink. If changelog is enabled on the volume it captures truncate
+ * as a data transactions on the arbiter brick. This will help geo-rep
+ * to properly sync the data from master to slave if arbiter is the
+ * ACTIVE brick during syncing and which had got some entries healed for
+ * data as part of self heal.
+ */
AFR_ONLIST (healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd,
size, NULL);
@@ -420,8 +421,6 @@ __afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this,
*/
healed_sinks[i] = 0;
- if (arbiter_sink_status)
- healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
return 0;
}
@@ -733,18 +732,17 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto unlock;
}
+ ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,
+ locked_replies[source].poststat.ia_size);
+ if (ret < 0)
+ goto unlock;
+
if (priv->arbiter_count &&
AFR_COUNT (healed_sinks, priv->child_count) == 1 &&
healed_sinks[ARBITER_BRICK_INDEX]) {
is_arbiter_the_only_sink = _gf_true;
goto restore_time;
}
-
- ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,
- locked_replies[source].poststat.ia_size);
- if (ret < 0)
- goto unlock;
-
ret = 0;
}