diff options
author | Kotresh HR <khiremat@redhat.com> | 2018-07-15 15:03:40 -0400 |
---|---|---|
committer | Kotresh HR <khiremat@redhat.com> | 2018-07-20 12:58:04 +0000 |
commit | 8d7be4ac0cf5ace9de3f340f411f1aed21f96414 (patch) | |
tree | 30c3be6612228dffba88db010e2a5f119e6c7dbe /geo-replication/syncdaemon/master.py | |
parent | ea4964df6f173b17eaf4e9048f55cfe969907663 (diff) |
geo-rep: Fix issues with gfid conflict handling
1. MKDIR/RMDIR is recorded on all bricks. So if
one brick succeeds creating it, other bricks
should ignore it. But this was not happening.
The fix rename of directories in hybrid crawl,
was trying to rename the directory to itself
and in the process crashing with ENOENT if the
directory is removed.
2. If file is created, deleted and a directory is
created with same name, it was failing to sync.
Again the issue is around the fix for rename
of directories in hybrid crawl. Fixed the same.
If the same case was done with hardlink present
for the file, it was failing. This patch fixes
that too.
fixes: bz#1598884
Change-Id: I6f3bca44e194e415a3d4de3b9d03cc8976439284
Signed-off-by: Kotresh HR <khiremat@redhat.com>
Diffstat (limited to 'geo-replication/syncdaemon/master.py')
-rw-r--r-- | geo-replication/syncdaemon/master.py | 141 |
1 files changed, 100 insertions, 41 deletions
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py index d9f63a440fb..665a51f64dd 100644 --- a/geo-replication/syncdaemon/master.py +++ b/geo-replication/syncdaemon/master.py @@ -711,7 +711,8 @@ class GMasterChangelogMixin(GMasterCommon): TYPE_GFID = "D " TYPE_ENTRY = "E " - MAX_EF_RETRIES = 15 + MAX_EF_RETRIES = 10 + MAX_OE_RETRIES = 5 # flat directory hierarchy for gfid based access FLAT_DIR_HIERARCHY = '.' @@ -803,21 +804,28 @@ class GMasterChangelogMixin(GMasterCommon): self.status.inc_value("failures", num_failures) - def fix_possible_entry_failures(self, failures, retry_count): + def fix_possible_entry_failures(self, failures, retry_count, entries): pfx = gauxpfx() fix_entry_ops = [] failures1 = [] for failure in failures: - if failure[2]['dst']: + if failure[2]['name_mismatch']: + pbname = failure[2]['slave_entry'] + elif failure[2]['dst']: pbname = failure[0]['entry1'] else: pbname = failure[0]['entry'] - if failure[2]['gfid_mismatch']: + + op = failure[0]['op'] + # name exists but gfid is different + if failure[2]['gfid_mismatch'] or failure[2]['name_mismatch']: slave_gfid = failure[2]['slave_gfid'] st = lstat(os.path.join(pfx, slave_gfid)) + # Takes care of scenarios with no hardlinks if isinstance(st, int) and st == ENOENT: - logging.info(lf('Fixing gfid mismatch in slave. Deleting' - ' the entry', retry_count=retry_count, + logging.info(lf('Entry not present on master. Fixing gfid ' + 'mismatch in slave. Deleting the entry', + retry_count=retry_count, entry=repr(failure))) # Add deletion to fix_entry_ops list if failure[2]['slave_isdir']: @@ -830,79 +838,119 @@ class GMasterChangelogMixin(GMasterCommon): edct('UNLINK', gfid=failure[2]['slave_gfid'], entry=pbname)) + # Takes care of scenarios of hardlinks/renames on master elif not isinstance(st, int): - # The file exists on master but with different name. - # Probabaly renamed and got missed during xsync crawl. - if failure[2]['slave_isdir']: - logging.info(lf('Fixing gfid mismatch in slave', + if matching_disk_gfid(slave_gfid, pbname): + # Safe to ignore the failure as master contains same + # file with same gfid. Remove entry from entries list + logging.info(lf('Fixing gfid mismatch in slave. ' + ' Safe to ignore, take out entry', retry_count=retry_count, entry=repr(failure))) - realpath = os.readlink(os.path.join( - rconf.args.local_path, - ".glusterfs", - slave_gfid[0:2], - slave_gfid[2:4], - slave_gfid)) + entries.remove(failure[0]) + # The file exists on master but with different name. + # Probably renamed and got missed during xsync crawl. + elif failure[2]['slave_isdir']: + realpath = os.readlink(os.path.join(gconf.local_path, + ".glusterfs", + slave_gfid[0:2], + slave_gfid[2:4], + slave_gfid)) dst_entry = os.path.join(pfx, realpath.split('/')[-2], realpath.split('/')[-1]) - rename_dict = edct('RENAME', gfid=slave_gfid, - entry=failure[0]['entry'], - entry1=dst_entry, stat=st, - link=None) - logging.info(lf('Fixing gfid mismatch in slave. ' - 'Renaming', retry_count=retry_count, - entry=repr(rename_dict))) - fix_entry_ops.append(rename_dict) + src_entry = pbname + logging.info(lf('Fixing dir name/gfid mismatch in ' + 'slave', retry_count=retry_count, + entry=repr(failure))) + if src_entry == dst_entry: + # Safe to ignore the failure as master contains + # same directory as in slave with same gfid. + # Remove the failure entry from entries list + logging.info(lf('Fixing dir name/gfid mismatch' + ' in slave. Safe to ignore, ' + 'take out entry', + retry_count=retry_count, + entry=repr(failure))) + entries.remove(failure[0]) + else: + rename_dict = edct('RENAME', gfid=slave_gfid, + entry=src_entry, + entry1=dst_entry, stat=st, + link=None) + logging.info(lf('Fixing dir name/gfid mismatch' + ' in slave. Renaming', + retry_count=retry_count, + entry=repr(rename_dict))) + fix_entry_ops.append(rename_dict) else: - logging.info(lf('Fixing gfid mismatch in slave. ' - ' Deleting the entry', + # A hardlink file exists with different name or + # renamed file exists and we are sure from + # matching_disk_gfid check that the entry doesn't + # exist with same gfid so we can safely delete on slave + logging.info(lf('Fixing file gfid mismatch in slave. ' + 'Hardlink/Rename Case. Deleting entry', retry_count=retry_count, entry=repr(failure))) fix_entry_ops.append( edct('UNLINK', gfid=failure[2]['slave_gfid'], entry=pbname)) - logging.error(lf('Entry cannot be fixed in slave due ' - 'to GFID mismatch, find respective ' - 'path for the GFID and trigger sync', - gfid=slave_gfid)) + elif failure[1] == ENOENT: + # Ignore ENOENT error for fix_entry_ops aka retry_count > 1 + if retry_count > 1: + logging.info(lf('ENOENT error while fixing entry ops. ' + 'Safe to ignore, take out entry', + retry_count=retry_count, + entry=repr(failure))) + entries.remove(failure[0]) + elif op in ('MKNOD', 'CREATE', 'MKDIR'): + pargfid = pbname.split('/')[1] + st = lstat(os.path.join(pfx, pargfid)) + # Safe to ignore the failure as master doesn't contain + # parent directory. + if isinstance(st, int): + logging.info(lf('Fixing ENOENT error in slave. Parent ' + 'does not exist on master. Safe to ' + 'ignore, take out entry', + retry_count=retry_count, + entry=repr(failure))) + entries.remove(failure[0]) if fix_entry_ops: # Process deletions of entries whose gfids are mismatched failures1 = self.slave.server.entry_ops(fix_entry_ops) - if not failures1: - logging.info("Sucessfully fixed entry ops with gfid mismatch") - return failures1 + return (failures1, fix_entry_ops) def handle_entry_failures(self, failures, entries): retries = 0 pending_failures = False failures1 = [] failures2 = [] + entry_ops1 = [] + entry_ops2 = [] if failures: pending_failures = True failures1 = failures + entry_ops1 = entries while pending_failures and retries < self.MAX_EF_RETRIES: retries += 1 - failures2 = self.fix_possible_entry_failures(failures1, - retries) + (failures2, entry_ops2) = self.fix_possible_entry_failures( + failures1, retries, entry_ops1) if not failures2: pending_failures = False + logging.info(lf('Sucessfully fixed entry ops with gfid ' + 'mismatch', retry_count=retries)) else: pending_failures = True failures1 = failures2 + entry_ops1 = entry_ops2 if pending_failures: for failure in failures1: logging.error("Failed to fix entry ops %s", repr(failure)) - else: - # Retry original entry list 5 times - failures = self.slave.server.entry_ops(entries) - - self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY') def process_change(self, change, done, retry): pfx = gauxpfx() @@ -1129,7 +1177,18 @@ class GMasterChangelogMixin(GMasterCommon): self.status.inc_value("entry", len(entries)) failures = self.slave.server.entry_ops(entries) - self.handle_entry_failures(failures, entries) + count = 0 + while failures and count < self.MAX_OE_RETRIES: + count += 1 + self.handle_entry_failures(failures, entries) + logging.info("Retry original entries. count = %s" % count) + failures = self.slave.server.entry_ops(entries) + if not failures: + logging.info("Sucessfully fixed all entry ops with gfid " + "mismatch") + break + + self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY') self.status.dec_value("entry", len(entries)) # Update Entry stime in Brick Root only in case of Changelog mode |