From dbaaacf720baedc7f94b3acb6a479db394f54f57 Mon Sep 17 00:00:00 2001 From: Nithya Balachandran Date: Mon, 27 Apr 2015 21:18:10 +0530 Subject: geo-rep: rename handling in dht volume Background: Glusterfs changelogs are stored in each brick and record the changes that happened in that brick. Geo-rep runs on all the nodes of the master and processes the changelogs "independently". Changelogs are processed at brick level, but all the fops are replayed on the "slave mount" point. Problem: With a DHT volume, "internal fops" are NOT recorded in the changelog. In the rename case, the rename is recorded in the "hashed" brick's changelog. (DHT's internal fops, such as creating the linkto file and the unlink, are NOT recorded.) This leads to inconsistent rename operations. For example, consider a Distribute volume created with two bricks, B1 and B2. //Consider master volume mounted @ /mnt/master and the following operations executed: cd /mnt/master touch f1 // f1 falls on B1 Hash mv f1 f2 // f2 falls on B2 Hash // Here, changelogs are recorded as below: @B1 CREATE f1 @B2 RENAME f1 f2 Here, a race exists between bricks B1 and B2; say B2 gets executed first. The source file f1 itself is "NOT PRESENT", so it will go ahead and create f2 (current implementation). We have this problem when the rename falls on another brick and the file is unlinked on the master. A similar issue exists in the following case too (multiple renames): CREATE f1 RENAME f1 f2 RENAME f2 f1 Solution: Instead of carrying out "changelogging" at the "HASHED volume", carry it out at the "CACHED volume". This way rename operations are carried out where the actual files are present. So the changelog is recorded as: @B1 CREATE f1 RENAME f1 f2 credit: sarumuga@redhat.com PS: Some races, such as the one below, are _NOT_ fixed by this patch * f1 and f2 exist. B1 and B2 are their respective cached subvols. For both files hashed-subvol == cached-subvol * mv f1 f2 on master. * B1 has change-log entry of rename f1 f2 * rebalance migrates f2 from B1 to B2 * mv f2 f1 on master.
* B2 has change-log entry of rename f2 f1 Since the changelog entries (rename f1 f2) and (rename f2 f1) are processed independently by gsyncds, which of f1 and f2 survives on the slave is subject to a race. Note that on the master it is file f1 with name f1 that survived. On the slave it can be either file f1 with name f1 or file f2 with name f2, depending on who wins the race of processing the changelogs. Change-Id: Iebc222f582613924c3a7cba37fb6d3e2d8332eda BUG: 1141379 Signed-off-by: Nithya Balachandran Reviewed-on: http://review.gluster.org/10410 Tested-by: Gluster Build System Tested-by: NetBSD Build System Reviewed-by: Raghavendra G Tested-by: Raghavendra G --- xlators/cluster/dht/src/dht-rename.c | 82 ++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'xlators') diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 0594945203e..4e4e9869685 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -319,6 +319,56 @@ err: NULL, NULL); return 0; } + + + +static int +dht_rename_track_for_changelog (xlator_t *this, dict_t *xattr, + loc_t *oldloc, loc_t *newloc) +{ + int ret = -1; + dht_changelog_rename_info_t *info = NULL; + char *name = NULL; + int len1 = 0; + int len2 = 0; + int size = 0; + + if (!xattr || !oldloc || !newloc || !this) + return ret; + + len1 = strlen (oldloc->name) + 1; + len2 = strlen (newloc->name) + 1; + size = sizeof (dht_changelog_rename_info_t) + len1 + len2; + + info = GF_CALLOC (size, sizeof(char), gf_common_mt_char); + if (!info) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_DICT_SET_FAILED, + "Failed to calloc memory"); + return ret; + } + + gf_uuid_copy (info->old_pargfid, oldloc->pargfid); + gf_uuid_copy (info->new_pargfid, newloc->pargfid); + + info->oldname_len = len1; + info->newname_len = len2; + strncpy (info->buffer, oldloc->name, len1); + name = info->buffer + len1; + strncpy (name, newloc->name, len2); + + ret = dict_set_bin (xattr, 
DHT_CHANGELOG_RENAME_OP_KEY, + info, size); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, + "Failed to set dictionary value: key = %s," + " path = %s", DHT_CHANGELOG_RENAME_OP_KEY, + oldloc->name); + } + return ret; +} + + #define DHT_MARK_FOP_INTERNAL(xattr) do { \ int tmp = -1; \ if (!xattr) { \ @@ -354,6 +404,32 @@ err: } \ }while (0) + +#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) { \ + gf_msg (this->name, GF_LOG_ERROR, 0, \ + DHT_MSG_DICT_SET_FAILED, \ + "Failed to create dictionary to " \ + "track rename"); \ + break; \ + } \ + } \ + \ + tmp = dht_rename_track_for_changelog (this, xattr, \ + oldloc, newloc); \ + \ + if (tmp) { \ + gf_msg (this->name, GF_LOG_ERROR, 0, \ + DHT_MSG_DICT_SET_FAILED, \ + "Failed to set dictionary value: key = %s," \ + " path = %s", DHT_CHANGELOG_RENAME_OP_KEY, \ + (oldloc)->path); \ + } \ + } while (0) + int dht_rename_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -745,6 +821,8 @@ err: DHT_MARKER_DONT_ACCOUNT(xattr_new); } + DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc, + &local->loc2); STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, &local->loc, 0, xattr_new); @@ -831,6 +909,10 @@ dht_do_rename (call_frame_t *frame) DHT_MARKER_DONT_ACCOUNT(dict); } + if (rename_subvol == src_cached) { + DHT_CHANGELOG_TRACK_AS_RENAME(dict, &local->loc, &local->loc2); + } + gf_msg_trace (this->name, 0, "renaming %s => %s (%s)", local->loc.path, local->loc2.path, rename_subvol->name); -- cgit