summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNithya Balachandran <nbalacha@redhat.com>2015-04-27 21:18:10 +0530
committerVijay Bellur <vbellur@redhat.com>2015-05-08 04:49:22 -0700
commitbb6de6aaf5ddbc93eca63b983c0bd2ead8969750 (patch)
treee374a5f927571db3f91eff4e3f1f87dc01f3b67b
parentbd6474eb64fd753bb636f45fd6a099df3b619152 (diff)
geo-rep: rename handling in dht volume
Background: Glusterfs changelogs are stored in each brick and record the changes that happened in that brick. Georep runs on all the nodes of the master and processes changelogs "independently". Changelogs are processed at brick level, but all the fops will be replayed on the "slave mount" point. Problem: With a DHT volume, "internal fops" are NOT recorded in the changelog. For the Rename case, Rename is recorded in the "hashed" brick changelog. (DHT's internal fops like creating linkto file, unlink are NOT recorded). This leads to inconsistent rename operations. For example, Distribute volume created with Two bricks B1, B2. //Consider master volume mounted @ /mnt/master and following operations executed: cd /mnt/master touch f1 // f1 falls on B1 Hash mv f1 f2 // f2 falls on B2 Hash // Here, Changelogs are recorded as below: @B1 CREATE f1 @B2 RENAME f1 f2 Here, a race exists between Brick B1 and B2; say B2 gets executed first. Source file f1 itself is "NOT PRESENT", so it will go ahead and create f2 (Current implementation). We have this problem when the rename falls in another brick and the file is unlinked in Master. A similar kind of issue exists in the following case too (multiple renames): CREATE f1 RENAME f1 f2 RENAME f2 f1 Solution: Instead of carrying out "changelogging" at the "HASHED volume", carry it out at the "CACHED volume". This way we have rename operations carried out where the actual files are present. So, the changelog is recorded as: @B1 CREATE f1 RENAME f1 f2 credit: sarumuga@redhat.com PS: Some races, such as the one below, are _NOT_ fixed by this patch * f1 and f2 exist. B1 and B2 are their respective cached subvols. For both files hashed-subvol == cached-subvol * mv f1 f2 on master. * B1 has change-log entry of rename f1 f2 * rebalance migrates f2 from B1 to B2 * mv f2 f1 on master. * B2 has change-log entry of rename f2 f1 Since changelog entries (rename f1 f2) and (rename f2 f1) are processed independently by gsyncds, which of f1 and f2 survives on the slave is subject to a race. 
Note that on master it is file f1 with name f1 which survived. On slave it can be either file f1 with name f1 or file f2 with name f2 based on who wins the race of processing changelog. BUG: 1219412 Change-Id: I43725d69635e2ce065135691ef629014e8df7d50 Original-Author: Nithya Balachandran <nbalacha@redhat.com> Reviewed-on: http://review.gluster.org/10410 Signed-off-by: Saravanakumar Arumugam <sarumuga@redhat.com> Reviewed-on: http://review.gluster.org/10628 Tested-by: Gluster Build System <jenkins@build.gluster.com> Tested-by: NetBSD Build System Reviewed-by: Kotresh HR <khiremat@redhat.com> Reviewed-by: Raghavendra G <rgowdapp@redhat.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r--libglusterfs/src/common-utils.h25
-rw-r--r--libglusterfs/src/glusterfs.h1
-rw-r--r--xlators/cluster/dht/src/dht-rename.c82
3 files changed, 108 insertions, 0 deletions
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h
index e8b5fc83591..c361405d5eb 100644
--- a/libglusterfs/src/common-utils.h
+++ b/libglusterfs/src/common-utils.h
@@ -130,6 +130,31 @@ enum _gf_xlator_ipc_targets {
typedef enum _gf_boolean gf_boolean_t;
typedef enum _gf_client_pid gf_client_pid_t;
typedef enum _gf_xlator_ipc_targets _gf_xlator_ipc_targets_t;
+
+/* The DHT file rename operation is not a straightforward rename.
+ * It involves creating linkto and linkfiles, and can unlink or rename the
+ * source file depending on the hashed and cached subvols for the source
+ * and target files. This makes it difficult for geo-rep to figure out that
+ * a rename operation has taken place.
+ *
+ * We now send a special key and the values of the source and target pargfids
+ * and basenames to indicate to changelog that the operation in question
+ * should be treated as a rename. We are explicitly filling and sending this
+ * as a binary value in the dictionary as the unlink op will not have the
+ * source file information. The lengths of the src and target basenames
+ * are used to calculate where to start reading the names in the structure.
+ * XFS allows a max of 255 chars for filenames but other file systems might
+ * not have such restrictions, so the names are stored in a variable-length
+ * trailing buffer rather than in fixed-size arrays.
+ *
+ * The structure is allocated with extra room after it; "buffer" holds the
+ * old basename followed immediately by the new basename, each including its
+ * terminating NUL.
+ */
+typedef struct dht_changelog_rename_info {
+ /* gfid of the parent directory of the rename source */
+ uuid_t old_pargfid;
+ /* gfid of the parent directory of the rename target */
+ uuid_t new_pargfid;
+ /* length of the old basename, including the terminating NUL */
+ int32_t oldname_len;
+ /* length of the new basename, including the terminating NUL */
+ int32_t newname_len;
+ /* start of the packed names: old basename, then new basename at
+  * offset oldname_len */
+ char buffer[1];
+ } dht_changelog_rename_info_t;
+
+
typedef int (*gf_cmp) (void *, void *);
void gf_global_variable_init(void);
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index d8d92ad7546..1b7041cab45 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -140,6 +140,7 @@
#define GLUSTERFS_VERSION_XCHG_KEY "glusterfs.version.xchg"
#define GLUSTERFS_INTERNAL_FOP_KEY "glusterfs-internal-fop"
+/* Dictionary key under which DHT stores a binary dht_changelog_rename_info_t
+ * describing the source and target of a rename. */
+#define DHT_CHANGELOG_RENAME_OP_KEY "changelog.rename-op"
#define ZR_FILE_CONTENT_STR "glusterfs.file."
#define ZR_FILE_CONTENT_STRLEN 15
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index 0594945203e..4e4e9869685 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -319,6 +319,56 @@ err:
NULL, NULL);
return 0;
}
+
+
+
+/*
+ * Pack the source and target parent gfids and basenames of a rename into a
+ * dht_changelog_rename_info_t and store it in xattr as a binary value under
+ * DHT_CHANGELOG_RENAME_OP_KEY.
+ *
+ * Buffer layout: the old basename (with its terminating NUL) immediately
+ * followed by the new basename (with its terminating NUL). oldname_len and
+ * newname_len both count the NUL.
+ *
+ * @this:   xlator used for logging.
+ * @xattr:  dictionary that receives the packed rename info.
+ * @oldloc: rename source; pargfid and name are read.
+ * @newloc: rename target; pargfid and name are read.
+ *
+ * Returns 0 on success, -1 on a NULL argument, a NULL basename or an
+ * allocation failure, and otherwise the non-zero result of dict_set_bin().
+ */
+static int
+dht_rename_track_for_changelog (xlator_t *this, dict_t *xattr,
+                                loc_t *oldloc, loc_t *newloc)
+{
+        int                          ret  = -1;
+        dht_changelog_rename_info_t *info = NULL;
+        char                        *name = NULL;
+        int                          len1 = 0;
+        int                          len2 = 0;
+        int                          size = 0;
+
+        if (!xattr || !oldloc || !newloc || !this) {
+                return ret;
+        }
+
+        /* strlen() on a NULL basename would crash; reject it up front */
+        if (!oldloc->name || !newloc->name) {
+                return ret;
+        }
+
+        /* both lengths include the terminating NUL */
+        len1 = strlen (oldloc->name) + 1;
+        len2 = strlen (newloc->name) + 1;
+        size = sizeof (dht_changelog_rename_info_t) + len1 + len2;
+
+        info = GF_CALLOC (size, sizeof(char), gf_common_mt_char);
+        if (!info) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        DHT_MSG_DICT_SET_FAILED,
+                        "Failed to calloc memory");
+                return ret;
+        }
+
+        gf_uuid_copy (info->old_pargfid, oldloc->pargfid);
+        gf_uuid_copy (info->new_pargfid, newloc->pargfid);
+
+        info->oldname_len = len1;
+        info->newname_len = len2;
+
+        /* lengths are exact (string + NUL), so a plain byte copy suffices */
+        memcpy (info->buffer, oldloc->name, len1);
+        name = info->buffer + len1;
+        memcpy (name, newloc->name, len2);
+
+        ret = dict_set_bin (xattr, DHT_CHANGELOG_RENAME_OP_KEY,
+                            info, size);
+        if (ret) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                        "Failed to set dictionary value: key = %s,"
+                        " path = %s", DHT_CHANGELOG_RENAME_OP_KEY,
+                        oldloc->name);
+                /* on failure dict_set_bin() leaves the buffer with the
+                 * caller, so release it here to avoid leaking it */
+                GF_FREE (info);
+        }
+        return ret;
+}
+
+
#define DHT_MARK_FOP_INTERNAL(xattr) do { \
int tmp = -1; \
if (!xattr) { \
@@ -354,6 +404,32 @@ err:
} \
}while (0)
+
+/*
+ * Tag an outgoing fop's xattr dictionary so that it is tracked as a rename
+ * of oldloc to newloc.
+ *
+ * If xattr is NULL a new dictionary is created and assigned to it, so xattr
+ * must be an lvalue. A variable named "this" must be in scope at the point
+ * of use. Failures are logged only; the macro yields no status.
+ */
+#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc) do {            \
+        int track_ret = -1;                                                  \
+                                                                             \
+        if (!xattr)                                                          \
+                xattr = dict_new ();                                         \
+                                                                             \
+        if (!xattr) {                                                        \
+                gf_msg (this->name, GF_LOG_ERROR, 0,                         \
+                        DHT_MSG_DICT_SET_FAILED,                             \
+                        "Failed to create dictionary to "                    \
+                        "track rename");                                     \
+                break;                                                       \
+        }                                                                    \
+                                                                             \
+        track_ret = dht_rename_track_for_changelog (this, xattr,             \
+                                                    oldloc, newloc);         \
+        if (track_ret != 0) {                                                \
+                gf_msg (this->name, GF_LOG_ERROR, 0,                         \
+                        DHT_MSG_DICT_SET_FAILED,                             \
+                        "Failed to set dictionary value: key = %s,"          \
+                        " path = %s", DHT_CHANGELOG_RENAME_OP_KEY,           \
+                        (oldloc)->path);                                     \
+        }                                                                    \
+ } while (0)
+
int
dht_rename_unlock_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
@@ -745,6 +821,8 @@ err:
DHT_MARKER_DONT_ACCOUNT(xattr_new);
}
+ DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc,
+ &local->loc2);
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
&local->loc, 0, xattr_new);
@@ -831,6 +909,10 @@ dht_do_rename (call_frame_t *frame)
DHT_MARKER_DONT_ACCOUNT(dict);
}
+ if (rename_subvol == src_cached) {
+ DHT_CHANGELOG_TRACK_AS_RENAME(dict, &local->loc, &local->loc2);
+ }
+
gf_msg_trace (this->name, 0,
"renaming %s => %s (%s)",
local->loc.path, local->loc2.path, rename_subvol->name);